In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from collections import Counter
from imblearn.under_sampling import TomekLinks
from collections import Counter
import string

## Import data

In [None]:
path='data/mono_multi/'

In [None]:
#data import
train_data_ecco=pd.read_csv(path+'ecco_train.csv')
train_data_ecco['ecco_full_title']=train_data_ecco['ecco_full_title'].str.translate(str.maketrans("", "", string.punctuation)).str.lower()
test_data_ecco=pd.read_csv(path+'ecco_test.csv')
test_data_ecco['ecco_full_title']=test_data_ecco['ecco_full_title'].str.translate(str.maketrans("", "", string.punctuation)).str.lower()
train_data_caa=pd.read_csv(path+'caa_train_df.csv')
train_data_caa['title']=train_data_caa['title'].str.translate(str.maketrans("", "", string.punctuation)).str.lower()
test_data_caa=pd.read_csv(path+'caa_test_df.csv')
test_data_caa['title']=test_data_caa['title'].str.translate(str.maketrans("", "", string.punctuation)).str.lower()
balanced_train_data_caa=pd.read_csv(path+'balanced_caa_train_df.csv')
balanced_train_data_caa['title']=balanced_train_data_caa['title'].str.translate(str.maketrans("", "", string.punctuation)).str.lower()
balanced_test_data_caa=pd.read_csv(path+'balanced_caa_test_df.csv')
balanced_test_data_caa['title']=balanced_test_data_caa['title'].str.translate(str.maketrans("", "", string.punctuation)).str.lower()
combined_train_data=pd.read_csv(path+'combined_train.csv')
combined_train_data['title']=combined_train_data['title'].str.translate(str.maketrans("", "", string.punctuation)).str.lower()
combined_test_data=pd.read_csv(path+'combined_test.csv')
combined_test_data['title']=combined_test_data['title'].str.translate(str.maketrans("", "", string.punctuation)).str.lower()

In [None]:
combined_ecco=pd.concat([train_data_ecco,test_data_ecco])

In [None]:
# Concatenate test and train data
combined_test_train = pd.concat([combined_test_data, combined_train_data])

# Tokenize the text and remove punctuation
def preprocess_text(text):
    """Return the lower-cased, whitespace-separated tokens of ``text``.

    Every character in ``string.punctuation`` is removed before splitting.

    Args:
        text: The title to tokenize. Non-string values (for example the float
            NaN that pandas uses for a missing title) are treated as empty.

    Returns:
        A list of token strings; empty for empty or non-string input.
    """
    # A missing title would otherwise raise AttributeError on .translate
    if not isinstance(text, str):
        return []
    # Remove punctuation using string.punctuation
    cleaned_text = text.translate(str.maketrans("", "", string.punctuation))
    return cleaned_text.lower().split()

# Token list for every title in the combined (test + train) frame
tokens_cleaned = combined_test_train['title'].apply(preprocess_text)

# Total number of occurrences of each token across all titles
word_counts = Counter(word for sublist in tokens_cleaned for word in sublist)

# The former loop only filled this dict for tokens missing from word_counts.
# word_counts is built from exactly these tokens, so that never happened and
# the dict was always empty. The name is kept (empty) for any cell that reads it.
presence_counts = {}

# Keep tokens that occur more than once but at most 0.8 * (number of titles) times.
# Note that `count` is the total occurrence count, not the number of titles
# containing the token.
n_titles = len(combined_test_train)
common_vocabulary = [word for word, count in word_counts.items() if 1 < count <= 0.8 * n_titles]

# Train on ECCO, test on ECCO for monolingual vs multilingual

In [None]:
train_data=train_data_ecco
id_field="estc_id"
source_field="ecco_full_title"
target_field="monolingual"

In [None]:
x_train=train_data[['estc_id', 'ecco_full_title']]
y_train=train_data[target_field]

In [None]:
# Grid-search a TF-IDF + LinearSVC pipeline on the ECCO training titles

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(vocabulary=common_vocabulary, lowercase=True)),
    ('clf', LinearSVC())
])

# Parameters of pipelines can be set using '__' separated parameter names.
#
# * LinearSVC only accepts penalty 'l1' or 'l2'; 'elasticnet' and None made
#   every fit for those candidates fail. 'l1' is only supported by the primal
#   solver, so that sub-grid pins dual=False.
# * common_vocabulary holds single tokens only, so n-grams of two or more
#   words never match a vocabulary entry: (1, 2) and (1, 3) duplicate (1, 1),
#   while (2, 2) and (2, 3) produce all-zero features. Only (1, 1) is searched;
#   the key is kept so best_params_ still reports it.
param_grid = [
    {
        'tfidf__ngram_range': [(1, 1)],
        'clf__penalty': ['l2'],
        'clf__class_weight': [None, 'balanced'],
    },
    {
        'tfidf__ngram_range': [(1, 1)],
        'clf__penalty': ['l1'],
        'clf__dual': [False],
        'clf__class_weight': [None, 'balanced'],
    },
]
search = GridSearchCV(pipeline, param_grid)
search.fit(x_train[source_field], y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

In [None]:
# Hyper-parameters used for the final ECCO-trained model
clf__class_weight = 'balanced'
clf__penalty = 'l2'
tfidf__ngram_range = (1, 1)

## I now do the prediction based on the best hyperparameters

In [None]:
text_transformer = TfidfVectorizer(vocabulary=common_vocabulary, ngram_range=tfidf__ngram_range, lowercase=True, max_features=150000)
clf= LinearSVC(class_weight=clf__class_weight, penalty=clf__penalty)

In [None]:
x_test=test_data_ecco[[id_field, source_field]]
y_test=test_data_ecco[target_field]

In [None]:
X_train_text = text_transformer.fit_transform(x_train[source_field])
X_test_text = text_transformer.transform(x_test[source_field])

In [None]:
clf.fit(X_train_text, y_train)
test_preds = clf.predict(X_test_text)

In [None]:
print(classification_report(y_test, test_preds))

In [None]:
accuracy_score(y_test, test_preds)

## I now do the prediction on CAA

In [None]:
id_field="RecordID"
source_field="title"
target_field="monolingual"

In [None]:
x_test=test_data_caa[[id_field, source_field]]
y_test=test_data_caa[target_field]
x_test_balanced=balanced_test_data_caa[[id_field, source_field]]
y_test_balanced=balanced_test_data_caa[target_field]

In [None]:
X_test_caa = text_transformer.transform(x_test[source_field])
X_balanced_test_caa=text_transformer.transform(x_test_balanced[source_field])

In [None]:
y_preds_caa = clf.predict(X_test_caa)
y_preds_balanced=clf.predict(X_balanced_test_caa)

In [None]:
print(classification_report(y_test, y_preds_caa))

In [None]:
print(classification_report(y_test_balanced, y_preds_balanced))

In [None]:
y_preds_balanced

In [None]:
accuracy_score(y_test, y_preds_caa)

In [None]:
accuracy_score(y_test_balanced, y_preds_balanced)

## I test on the combined dataset

In [None]:
id_field="id"
source_field="title"
target_field="monolingual"

In [None]:
x_test=combined_test_data[[id_field, source_field]]
y_test=combined_test_data[target_field]

In [None]:
X_test = text_transformer.transform(x_test[source_field])
y_preds = clf.predict(X_test)

In [None]:
print(classification_report(y_test, y_preds))

In [None]:
accuracy_score(y_test, y_preds)

## I train and predict on CAA

In [None]:
train_data=train_data_caa

In [None]:
id_field="RecordID"
source_field="title"
target_field="monolingual"
x_train=train_data[[id_field, source_field]]
y_train=train_data[target_field]

In [None]:
# Grid-search a TF-IDF + LinearSVC pipeline on the CAA training titles

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(vocabulary=common_vocabulary, lowercase=True)),
    ('clf', LinearSVC())
])

# Parameters of pipelines can be set using '__' separated parameter names.
#
# * LinearSVC only accepts penalty 'l1' or 'l2'; 'elasticnet' and None made
#   every fit for those candidates fail. 'l1' is only supported by the primal
#   solver, so that sub-grid pins dual=False.
# * common_vocabulary holds single tokens only, so n-grams of two or more
#   words never match a vocabulary entry: (1, 2) and (1, 3) duplicate (1, 1),
#   while (2, 2) and (2, 3) produce all-zero features. Only (1, 1) is searched;
#   the key is kept so best_params_ still reports it.
param_grid = [
    {
        'tfidf__ngram_range': [(1, 1)],
        'clf__penalty': ['l2'],
        'clf__class_weight': [None, 'balanced'],
    },
    {
        'tfidf__ngram_range': [(1, 1)],
        'clf__penalty': ['l1'],
        'clf__dual': [False],
        'clf__class_weight': [None, 'balanced'],
    },
]
search = GridSearchCV(pipeline, param_grid)
search.fit(x_train[source_field], y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.978):
{'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'tfidf__max_df': 0.7, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}

In [None]:
# Hyper-parameters used for the final CAA-trained model
clf__class_weight = 'balanced'
clf__penalty = 'l2'
tfidf__ngram_range = (1, 1)

### I now do the prediction based on the best hyperparameters

In [None]:
text_transformer = TfidfVectorizer(vocabulary=common_vocabulary, ngram_range=tfidf__ngram_range, lowercase=True, max_features=150000)
clf= LinearSVC(class_weight=clf__class_weight, penalty=clf__penalty)

In [None]:
x_test=test_data_caa[[id_field, source_field]]
y_test=test_data_caa[target_field]

In [None]:
X_train_text = text_transformer.fit_transform(x_train[source_field])
X_test_text = text_transformer.transform(x_test[source_field])
clf.fit(X_train_text, y_train)

In [None]:
x_test=test_data_caa[[id_field, source_field]]
y_test=test_data_caa[target_field]
x_test_balanced=balanced_test_data_caa[[id_field, source_field]]
y_test_balanced=balanced_test_data_caa[target_field]

In [None]:
X_test_caa = text_transformer.transform(x_test[source_field])
X_balanced_test_caa=text_transformer.transform(x_test_balanced[source_field])

In [None]:
y_preds_caa = clf.predict(X_test_caa)
y_preds_balanced=clf.predict(X_balanced_test_caa)

In [None]:
print(classification_report(y_test, y_preds_caa))

In [None]:
print(classification_report(y_test_balanced, y_preds_balanced))

In [None]:
accuracy_score(y_test, y_preds_caa)

In [None]:
accuracy_score(y_test_balanced, y_preds_balanced)

### I test on ECCO

In [None]:
id_field="estc_id"
source_field="ecco_full_title"
target_field="monolingual"

In [None]:
x_test=test_data_ecco[[id_field, source_field]]
y_test=test_data_ecco[target_field]

In [None]:
X_test_text = text_transformer.transform(x_test[source_field])

In [None]:
test_preds=clf.predict(X_test_text)

In [None]:
print(classification_report(y_test, test_preds))

In [None]:
accuracy_score(y_test, test_preds)

## I test on the combined dataset

In [None]:
id_field="id"
source_field="title"
target_field="monolingual"

In [None]:
x_test=combined_test_data[[id_field, source_field]]
y_test=combined_test_data[target_field]

In [None]:
X_test = text_transformer.transform(x_test[source_field])
y_preds = clf.predict(X_test)

In [None]:
print(classification_report(y_test, y_preds))

In [None]:
accuracy_score(y_test, y_preds)


## I train on the combined dataset and test on each dataset individually

In [None]:
train_data=combined_train_data

In [None]:
id_field="id"
source_field="title"
target_field="monolingual"

In [None]:
x_train=train_data[[id_field,source_field]]
y_train=train_data[target_field]

In [None]:
# Grid-search a TF-IDF + LinearSVC pipeline on the combined training titles

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(vocabulary=common_vocabulary, lowercase=True)),
    ('clf', LinearSVC())
])

# Parameters of pipelines can be set using '__' separated parameter names.
#
# * LinearSVC only accepts penalty 'l1' or 'l2'; 'elasticnet' and None made
#   every fit for those candidates fail. 'l1' is only supported by the primal
#   solver, so that sub-grid pins dual=False.
# * common_vocabulary holds single tokens only, so n-grams of two or more
#   words never match a vocabulary entry: (1, 2) and (1, 3) duplicate (1, 1),
#   while (2, 2) and (2, 3) produce all-zero features. Only (1, 1) is searched;
#   the key is kept so best_params_ still reports it.
param_grid = [
    {
        'tfidf__ngram_range': [(1, 1)],
        'clf__penalty': ['l2'],
        'clf__class_weight': [None, 'balanced'],
    },
    {
        'tfidf__ngram_range': [(1, 1)],
        'clf__penalty': ['l1'],
        'clf__dual': [False],
        'clf__class_weight': [None, 'balanced'],
    },
]
search = GridSearchCV(pipeline, param_grid)
search.fit(x_train[source_field], y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

In [None]:
# Hyper-parameters used for the final model trained on the combined data
clf__class_weight = None
clf__penalty = 'l2'
tfidf__ngram_range = (1, 1)

## I now do the prediction based on the best hyperparameters

In [None]:
text_transformer = TfidfVectorizer(vocabulary=common_vocabulary, ngram_range=tfidf__ngram_range, lowercase=True, max_features=150000)
clf= LinearSVC(class_weight=clf__class_weight, penalty=clf__penalty)

In [None]:
x_test=combined_test_data[[id_field, source_field]]
y_test=combined_test_data[[target_field, 'source']]

In [None]:
X_train_text = text_transformer.fit_transform(x_train[source_field])
X_test_text = text_transformer.transform(x_test[source_field])

In [None]:
clf.fit(X_train_text, y_train)
test_preds = clf.predict(X_test_text)

In [None]:
print(classification_report(y_test[target_field], test_preds))

In [None]:
accuracy_score(y_test[target_field], test_preds)

### I test on ECCO

In [None]:
id_field="estc_id"
source_field="ecco_full_title"
target_field="monolingual"

In [None]:
x_test=test_data_ecco[[id_field, source_field]]
y_test=test_data_ecco[target_field]

In [None]:
X_test_text = text_transformer.transform(x_test[source_field])
test_preds=clf.predict(X_test_text)

In [None]:
print(classification_report(y_test, test_preds))

In [None]:
accuracy_score(y_test, test_preds)

### I test on unbalanced CAA

In [None]:
id_field="RecordID"
source_field="title"
target_field="monolingual"

In [None]:
x_test=test_data_caa[[id_field, source_field]]
y_test=test_data_caa[target_field]

In [None]:
X_test_text = text_transformer.transform(x_test[source_field])
test_preds=clf.predict(X_test_text)

In [None]:
print(classification_report(y_test, test_preds))

In [None]:
accuracy_score(y_test, test_preds)

## I test on balanced CAA

In [None]:
id_field="RecordID"
source_field="title"
target_field="monolingual"

In [None]:
x_test=balanced_test_data_caa[[id_field, source_field]]
y_test=balanced_test_data_caa[target_field]

In [None]:
X_test_text = text_transformer.transform(x_test[source_field])
test_preds=clf.predict(X_test_text)

In [None]:
print(classification_report(y_test, test_preds))

In [None]:
accuracy_score(y_test, test_preds)

## I now redo the pipeline for CAA by sampling the data for more balanced classes

In [None]:
train_data=balanced_train_data_caa

In [None]:
id_field="RecordID"
source_field="title"
target_field="monolingual"

In [None]:
x_train=train_data[[id_field, source_field]]
y_train=train_data[target_field]

In [None]:
# Grid-search a TF-IDF + LinearSVC pipeline on the balanced CAA training titles

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(vocabulary=common_vocabulary, lowercase=True)),
    ('clf', LinearSVC())
])

# Parameters of pipelines can be set using '__' separated parameter names.
#
# * LinearSVC only accepts penalty 'l1' or 'l2'; 'elasticnet' and None made
#   every fit for those candidates fail. 'l1' is only supported by the primal
#   solver, so that sub-grid pins dual=False.
# * common_vocabulary holds single tokens only, so n-grams of two or more
#   words never match a vocabulary entry: (1, 2) and (1, 3) duplicate (1, 1),
#   while (2, 2) and (2, 3) produce all-zero features. Only (1, 1) is searched;
#   the key is kept so best_params_ still reports it.
param_grid = [
    {
        'tfidf__ngram_range': [(1, 1)],
        'clf__penalty': ['l2'],
        'clf__class_weight': [None, 'balanced'],
    },
    {
        'tfidf__ngram_range': [(1, 1)],
        'clf__penalty': ['l1'],
        'clf__dual': [False],
        'clf__class_weight': [None, 'balanced'],
    },
]
search = GridSearchCV(pipeline, param_grid)
search.fit(x_train[source_field], y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.978):
{'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'tfidf__max_df': 0.7, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}

In [None]:
# Hyper-parameters used for the final model trained on balanced CAA
clf__class_weight = None
clf__penalty = 'l2'
tfidf__ngram_range = (1, 1)

## I now do the prediction based on the best hyperparameters

In [None]:
text_transformer = TfidfVectorizer(vocabulary=common_vocabulary, ngram_range=tfidf__ngram_range, lowercase=True, max_features=150000)
clf= LinearSVC(class_weight=clf__class_weight, penalty=clf__penalty)

In [None]:
x_test=balanced_test_data_caa[[id_field, source_field]]
y_test=balanced_test_data_caa[target_field]

In [None]:
X_train_text = text_transformer.fit_transform(x_train[source_field])
X_test_text = text_transformer.transform(x_test[source_field])

In [None]:
clf.fit(X_train_text, y_train)
X_test_caa = text_transformer.transform(x_test[source_field])

In [None]:
y_preds_caa = clf.predict(X_test_caa)

In [None]:
print(classification_report(y_test, y_preds_caa))

In [None]:
accuracy_score(y_test_balanced, y_preds_balanced)

In [None]:
accuracy_score(y_test, y_preds_caa)

## I test on unbalanced CAA

In [None]:
id_field="RecordID"
source_field="title"
target_field="monolingual"

In [None]:
x_test=test_data_caa[[id_field, source_field]]
y_test=test_data_caa[target_field]

In [None]:
X_test_text = text_transformer.transform(x_test[source_field])
test_preds=clf.predict(X_test_text)

In [None]:
print(classification_report(y_test, test_preds))

In [None]:
accuracy_score(y_test, test_preds)

## I test on ECCO

In [None]:
id_field="estc_id"
source_field="ecco_full_title"
target_field="monolingual"

In [None]:
x_test=test_data_ecco[[id_field, source_field]]
y_test=test_data_ecco[target_field]

In [None]:
X_test_text = text_transformer.transform(x_test[source_field])
test_preds=clf.predict(X_test_text)

In [None]:
print(classification_report(y_test, test_preds))

In [None]:
accuracy_score(y_test, test_preds)