In [29]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack



In [30]:
data_cleaned=pd.read_csv(r'data_cleaned.csv') # Importing data where the texts are translated and cleaned from stopwords

In [33]:
# def process_title(title):
#     """
#     The function is tokenizing text into separate tokens, in this case words
#     """
#     tokenized_title = tokenizer.tokenize(title)
#     return tokenized_title


In [37]:
model = SentenceTransformer('all-MiniLM-L6-v2') #Pretrained sentence transformer to get embeddings for text columns

data_cleaned['Text_vector']=data_cleaned['Text_first200_translated'].apply(lambda x :model.encode(x))
data_cleaned['Title_vector']=data_cleaned['Title_Translated'].apply(lambda x :model.encode(x))

In [38]:

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

# Fit LabelEncoder on the "Category" column
label_encoder.fit(data_cleaned['Category'])

# Transform the "Category" column to numerical classes
data_cleaned['Category_2'] = label_encoder.transform(data_cleaned['Category'])


In [39]:
data_cleaned['Category'].value_counts()

Մշակույթ            739
Տնտեսություն        719
Կորոնավիրուս        645
Սպորտ               575
Քաղաքականություն    515
ՏՏ ոլորտ            187
Name: Category, dtype: int64

In [40]:
data_cleaned['Category_2'].value_counts()

1    739
4    719
0    645
2    575
5    515
3    187
Name: Category_2, dtype: int64

### Model using first 200 words of text only

In [66]:
# Split the data into training and testing sets
train, test = train_test_split(data_cleaned, random_state=42, test_size=0.3)

# Extract the feature columns and target column for the training set
X_train = train.Text_first200_translated_cleaned
y_train = train.Category_2

# Extract the feature columns and target column for the testing set
X_test = test.Text_first200_translated_cleaned
y_test = test.Category_2



In [67]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_df=0.9, min_df=1, stop_words='english')

# Convert the training set text data to TF-IDF vectors
train_vectors = vectorizer.fit_transform(X_train)

# Convert the testing set text data to TF-IDF vectors
test_vectors = vectorizer.transform(X_test)




In [68]:
# Create an MLPClassifier model
mlp = MLPClassifier()

# Fit the model on the training data
mlp.fit(train_vectors, y_train)

# Predict the target labels for the testing data
mlp_prediction = mlp.predict(test_vectors)



In [70]:
# Calculate the F1 score using weighted averaging
f1 = f1_score(y_test, mlp_prediction, average='weighted')
f1

0.9398312998149743

In [71]:
accuracy_score(y_test, mlp_prediction)


0.9398422090729783

In [72]:
X_train

3273    AmericanArmenian TV star Kim Kardashian visit ...
900     Every year various cultural centers museums li...
422     epidemic situation also tense Artsakh number m...
887     Film production complicated process especially...
1825    morning capital Ukraine targeted drones Presid...
                              ...                        
1095    future museums restore rethink title Internati...
1130    Yesterday evening biggest theater event year t...
1294    known recent years publishing processes number...
860     Armenia Art Fair2022 art fair opens today Yere...
3174    Minutes ago Lille Slovakian football team beat...
Name: Text_first200_translated_cleaned, Length: 2366, dtype: object

In [73]:
data_cleaned

Unnamed: 0.1,Unnamed: 0,Category,Title,Text,Text_first200,Text_first200_translated,Title_Translated,Text_first200_translated_cleaned,Title_Translated_cleaned,Text_vector,Title_vector,Category_2
0,0,Կորոնավիրուս,ԱՀԿ-ը Չինաստանին խնդրում է համագործակցել COVID...,Առողջապահության համաշխարհային կազմակերպությու...,Առողջապահության համաշխարհային կազմակերպություն...,The World Health Organization (WHO) is confide...,WHO asks China to cooperate in identifying the...,World Health Organization confident China sign...,asks China cooperate identifying origin COVID19,"[-0.081531666, 0.049715642, -0.03568201, 0.048...","[-0.08257137, 0.05076738, -0.015521194, -0.012...",0
1,1,Կորոնավիրուս,ԱՄՆ նախագահը ստորագրեց COVID-19-ի արտակարգ դրո...,Միացյալ Նահանգների նախագահ Ջո Բայդենը հավանու...,Միացյալ Նահանգների նախագահ Ջո Բայդենը հավանութ...,"The President of the United States, Joe Biden,...",The President of the United States signed the ...,President United States Joe Biden approved lif...,President United States signed law end state e...,"[-0.02142138, 0.058397654, 0.09130869, 0.01095...","[-0.03281866, 0.041443933, 0.024626661, -0.010...",0
2,2,Կորոնավիրուս,ԱՄՆ հետախուզությանը հանձնարարվել է կորոնավիրու...,Միացյալ Նահանգների նախագահ Ջո Բայդենը երեկ օր...,Միացյալ Նահանգների նախագահ Ջո Բայդենը երեկ օրե...,United States President Joe Biden signed a law...,US intelligence has been instructed to declass...,United States President Joe Biden signed law y...,US intelligence instructed declassify informat...,"[-0.0761797, 0.08938678, 0.027588481, -0.04014...","[-0.04919257, 0.05779532, -0.03231708, 0.00079...",0
3,3,Կորոնավիրուս,ՀԴԲ տնօրեն. Covid-19-ը ենթադրաբար ծագել է չինա...,Միացյալ Նահանգների Հետաքննությունների դաշնայի...,Միացյալ Նահանգների Հետաքննությունների դաշնային...,The director of the Federal Bureau of Investig...,Director of the FBI. Covid-19 is believed to h...,director Federal Bureau Investigation FBI Unit...,Director FBI Covid19 believed originated labor...,"[-0.09603007, 0.060622435, -0.030653626, 0.054...","[-0.08993958, -0.0047365096, -0.08058686, 0.00...",0
4,4,Կորոնավիրուս,ԱՄՆ-ում նորից խոսում են կորոնավիրուսի «չինական...,Միացյալ Նահանգներում կենսաբանական լաբորատորիա...,Միացյալ Նահանգներում կենսաբանական լաբորատորիան...,"The Department of Energy, which oversees the a...","In the USA, they are again talking about the ""...",Department Energy oversees activities biologic...,USA talking Chinese trace coronavirus,"[-0.047141563, 0.0538676, 0.012197902, 0.07695...","[-0.014239151, 0.028358832, 0.0043151435, 0.05...",0
...,...,...,...,...,...,...,...,...,...,...,...,...
3375,3375,ՏՏ ոլորտ,«Համադասարանցիներ» սոցիալական ցանցը ուղիղ վիդե...,Toyota-ն և Uber-ը ռազմավարական գործընկերությո...,Toyota-ն և Uber-ը ռազմավարական գործընկերությու...,Toyota and Uber have formed a strategic partne...,"The ""Classmates"" social network is launching a...",Toyota Uber formed strategic partnership Japan...,Classmates social network launching live video...,"[-0.0033210143, -0.032762624, 0.014608071, -0....","[-0.0063318717, -0.079445325, -0.044393063, -0...",3
3376,3376,ՏՏ ոլորտ,Այգեկ գյուղի ինժեներական լաբորատորիայում երեխա...,Միացյալ Նահանգների Հետաքննությունների դաշնայի...,Միացյալ Նահանգների Հետաքննությունների դաշնային...,The United States Federal Bureau of Investigat...,In the engineering laboratory of Aygek village...,United States Federal Bureau Investigation dro...,engineering laboratory Aygek village children ...,"[-0.08302667, 0.053029276, -0.008729843, -0.03...","[-0.08819589, 0.03656708, -0.08492272, 0.04892...",3
3377,3377,ՏՏ ոլորտ,Twitter-ը հայտարարում է երկար սպասված փոփոխութ...,Դատարանում Միացյալ Նահանգների արդարադատության...,Դատարանում Միացյալ Նահանգների արդարադատության ...,After yesterday's defeat by the United States ...,Twitter is announcing some long-awaited changes,yesterdays defeat United States Department Jus...,Twitter announcing longawaited changes,"[-0.07224804, 0.035403356, 0.0068919775, -0.06...","[-0.04011054, 0.012772688, 0.05832622, -0.0057...",3
3378,3378,ՏՏ ոլորտ,Toyota-ն և Uber-ը ռազմավարական գործընկերությու...,1989 թվականին թողարկված «Վերադարձ դեպի ապագա ...,1989 թվականին թողարկված «Վերադարձ դեպի ապագա 2...,"The hero of the 1989 science fiction film ""Bac...",Toyota and Uber have formed a strategic partne...,hero 1989 science fiction film Back Future 2 a...,Toyota Uber formed strategic partnership,"[-0.112825565, 0.006945745, 0.01304844, -0.030...","[0.047724172, 0.010241674, -0.03746954, -0.067...",3


### Model Using title and first 200 words of text

In [74]:
# Split the data into training and testing sets
train, test = train_test_split(data_cleaned, random_state=42, test_size=0.3)

# Concatenate the two text columns for the training set
X_train = train['Text_first200_translated_cleaned'] + ' ' + train['Title_Translated_cleaned']

# Concatenate the two text columns for the testing set
X_test = test['Text_first200_translated_cleaned'] + ' ' + test['Title_Translated_cleaned']

# Extract the target column for the training and testing sets
y_train = train['Category_2']
y_test = test['Category_2']





In [75]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_df=0.9, min_df=1, stop_words='english')

# Convert the training set text data to TF-IDF vectors
train_vectors = vectorizer.fit_transform(X_train)

# Convert the testing set text data to TF-IDF vectors
test_vectors = vectorizer.transform(X_test)

In [76]:

# Create an MLPClassifier model
mlp = MLPClassifier()

# Fit the model on the training data
mlp.fit(train_vectors, y_train)

# Predict the target labels for the testing data
mlp_prediction = mlp.predict(test_vectors)

# Calculate the F1 score using weighted averaging
f1 = f1_score(y_test, mlp_prediction, average='weighted')

In [77]:
f1

0.9505752142533993

In [78]:
accuracy_score(y_test, mlp_prediction)

0.9506903353057199

### Model Using title and first 200 words of text separately


In [83]:
# Split the data into training and testing sets
train, test = train_test_split(data_cleaned, random_state=42, test_size=0.3)

X_train = train[['Text_first200_translated_cleaned','Title_Translated_cleaned']]

X_test = test[['Text_first200_translated_cleaned','Title_Translated_cleaned']]

y_train = train['Category_2']
y_test = test['Category_2']

In [84]:
X_train

Unnamed: 0,Text_first200_translated_cleaned,Title_Translated_cleaned
3273,AmericanArmenian TV star Kim Kardashian visit ...,DataArt company also operate Yerevan
900,Every year various cultural centers museums li...,World Poetry Day celebrated Armenia
422,epidemic situation also tense Artsakh number m...,Lets infect others others infect us Vaccinatio...
887,Film production complicated process especially...,Jivan Avetisyans film Heavens Gate shown Ameri...
1825,morning capital Ukraine targeted drones Presid...,corruption scandal European Parliament abating
...,...,...
1095,future museums restore rethink title Internati...,113 museums Armenia Artsakh involved events In...
1130,Yesterday evening biggest theater event year t...,best known Artavazd 20th award ceremony held
1294,known recent years publishing processes number...,Dont Kill Tomorrows Book initiative seeing suc...
860,Armenia Art Fair2022 art fair opens today Yere...,Armenia Art Fair2022 modern art fair starting


In [102]:
# Create a TF-IDF vectorizer
vectorizer1 = TfidfVectorizer(max_df=0.9, min_df=1, stop_words='english')

# Convert the training set text data to TF-IDF vectors
train_vectors1 = vectorizer1.fit_transform(X_train['Text_first200_translated_cleaned'])


vectorizer2 = TfidfVectorizer(max_df=0.9, min_df=1, stop_words='english')

# Convert the training set text data to TF-IDF vectors
train_vectors2 = vectorizer2.fit_transform(X_train['Title_Translated_cleaned'])
# train_vectors2 = vectorizer.fit_transform(X_train)


# Convert the testing set text data to TF-IDF vectors
test_vectors1 = vectorizer1.transform(X_test['Text_first200_translated_cleaned'])
test_vectors2 = vectorizer2.transform(X_test['Title_Translated_cleaned'])

In [105]:

train_vectors_combined = hstack((train_vectors1, train_vectors2))
test_vectors_combined = hstack((test_vectors1, test_vectors2))



In [107]:

# Create an MLPClassifier model
mlp = MLPClassifier()

# Fit the model on the training data
mlp.fit(train_vectors_combined, y_train_reshaped)

# Predict the target labels for the testing data
mlp_prediction = mlp.predict(test_vectors_combined)

# Calculate the F1 score using weighted averaging
f1 = f1_score(y_test, mlp_prediction, average='weighted')

In [108]:
f1

0.9497501888891813

In [109]:
accuracy_score(y_test, mlp_prediction)

0.9497041420118343

### Bert Vectorization

In [146]:
# Split the data into training and testing sets
train, test = train_test_split(data_cleaned, random_state=42, test_size=0.3)
train_features=train['Text_vector'].apply(pd.Series)
test_features=test['Text_vector'].apply(pd.Series)

# Extract the feature columns and target column for the training set
X_train = train_features
y_train = train.Category_2

# Extract the feature columns and target column for the testing set
X_test = test_features
y_test = test.Category_2



In [147]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
3273,-0.018321,-0.052357,-0.018651,-0.047124,0.013237,0.006286,0.077416,-0.041525,-0.002589,-0.015058,...,0.092750,0.060206,0.055140,-0.025249,-0.021813,0.081310,0.008720,-0.077827,-0.025864,0.024092
900,-0.031328,0.035341,0.051224,0.009007,-0.126093,0.068837,0.077541,-0.005875,0.073230,-0.019220,...,-0.002163,0.050991,0.000793,0.027465,-0.040250,0.036279,0.020731,-0.066904,-0.014063,-0.062386
422,0.052998,0.003873,-0.033136,0.052382,-0.005269,0.013558,0.020203,0.014771,-0.041350,0.032940,...,-0.076335,0.033985,-0.001367,-0.027902,0.031219,-0.040230,0.059878,-0.071884,-0.062483,0.053956
887,-0.079986,0.008797,-0.075535,-0.063821,-0.042241,0.057083,-0.006113,-0.042671,0.141165,0.020186,...,0.100943,0.011537,0.012144,0.014254,0.011945,0.066255,0.065277,-0.040160,0.002799,0.053757
1825,0.008638,0.092242,0.007948,0.011786,0.057034,-0.051632,0.059525,-0.073021,0.010913,0.030849,...,0.021015,0.016952,-0.035221,-0.056255,0.055632,0.017367,-0.009059,-0.016018,-0.066937,0.002066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,-0.009962,0.118509,-0.002937,0.027881,-0.021739,0.004040,-0.011777,-0.042067,-0.016062,-0.038522,...,-0.031697,0.031880,-0.038804,0.011694,-0.034657,0.020018,0.096620,-0.047898,-0.080932,0.026744
1130,-0.000267,0.044392,-0.084331,-0.070166,-0.092035,0.110811,0.090509,-0.000427,-0.006379,-0.010363,...,0.033917,-0.020676,-0.011809,0.066080,-0.003897,0.036833,-0.036978,0.024462,-0.027776,0.010644
1294,-0.065438,0.040576,-0.020869,-0.034447,-0.065259,0.017677,0.044606,0.039799,0.107024,0.111050,...,0.003349,0.049428,-0.014817,0.039712,0.041138,0.003920,0.066994,-0.092629,-0.032323,0.060776
860,-0.043927,-0.010038,-0.000662,-0.031824,0.013851,0.054518,-0.009458,-0.082221,-0.003457,0.039939,...,0.050679,0.041752,-0.055694,0.023295,-0.019691,0.073977,0.066427,-0.029612,-0.028127,0.024482


In [148]:
# Create an MLPClassifier model
mlp = MLPClassifier()

# Fit the model on the training data
mlp.fit(X_train, y_train)

# Predict the target labels for the testing data
mlp_prediction = mlp.predict(X_test)





In [149]:
# Calculate the F1 score using weighted averaging
f1 = f1_score(y_test, mlp_prediction, average='weighted')
f1

0.9259338796080181

In [150]:
accuracy_score(y_test, mlp_prediction)


0.9260355029585798

### Grid Search of MLP classifier

In [82]:


# Split the data into training and testing sets
train, test = train_test_split(data_cleaned, random_state=42, test_size=0.3)

# Define the feature and target columns
X_train = train['Text_first200_translated_cleaned'] + ' ' + train['Title_Translated_cleaned']
y_train = train['Category_2']
X_test = test['Text_first200_translated_cleaned'] + ' ' + test['Title_Translated_cleaned']
y_test = test['Category_2']

# Create a pipeline with TfidfVectorizer and MLPClassifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mlp', MLPClassifier())
])

# Define the parameter grid for grid search
param_grid = {
    'tfidf__max_df': [0.9, 0.95],
    'tfidf__min_df': [1, 2],
    'mlp__hidden_layer_sizes': [(50,), (100,), (50, 50)],
#     'mlp__activation': ['relu', 'tanh'],
    'mlp__alpha': [0.0001, 0.001, 0.01]
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=2)
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Predict the target labels for the testing data using the best model
y_pred = grid_search.predict(X_test)

# Calculate the F1 score using weighted averaging
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the best parameters, best score, and F1 score
print("Best Parameters:", best_params)
print("Best Score:", best_score)
print("F1 Score:", f1)




Best Parameters: {'mlp__alpha': 0.001, 'mlp__hidden_layer_sizes': (100,), 'tfidf__max_df': 0.9, 'tfidf__min_df': 1}
Best Score: 0.945054945054945
F1 Score: 0.9507063442010122


In [None]:
#Default parameters
# MLPClassifier(
#     hidden_layer_sizes=(100,),
#     activation='relu',
#     solver='adam',
#     alpha=0.0001,
#     batch_size='auto',
#     learning_rate='constant',
#     learning_rate_init=0.001,
#     power_t=0.5,
#     max_iter=200,
#     shuffle=True,
#     random_state=None,
#     tol=1e-4,
#     verbose=False,
#     warm_start=False,
#     momentum=0.9,
#     nesterovs_momentum=True,
#     early_stopping=False,
#     validation_fraction=0.1,
#     beta_1=0.9,
#     beta_2=0.999,
#     epsilon=1e-8,
#     n_iter_no_change=10,
#     max_fun=15000
# )
