In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report




In [3]:
# df = pd.read_csv('seventh.csv')
# df.head(10)

In [4]:
df = pd.read_csv('created_new_train_data/add_embedding_features_9.csv')

In [5]:
df.columns

Index(['Sentence_id', 'Text', 'class_label', 'Tokens', 'text_length',
       'sentiment_score_veda', 'sentiment_score_textblob',
       'sentiment_score_bert', 'sentiment_score_roberta', 'bert_sent_neg',
       'bert_sent_pos', 'roberta_sent_neg', 'roberta_sent_neut',
       'roberta_sent_mixed', 'roberta_sent_pos', 'labels', 'names',
       'organizations', 'locations', 'dates', 'verbs', 'action_verbs',
       'filtered_action_verbs', 'joined_tokens', 'count_verb',
       'count_action_verb', 'count_filtered_action_verb', 'count_tokens',
       'cleaned_text', 'cleaned_text_length', 'contains_question_mark',
       'contains_exclamation', 'contains_ellipsis', 'num_exclamations',
       'num_questions', 'num_ellipses', 'punctuation_count', 'embedding'],
      dtype='object')

In [6]:
df.head(2)

Unnamed: 0,Sentence_id,Text,class_label,Tokens,text_length,sentiment_score_veda,sentiment_score_textblob,sentiment_score_bert,sentiment_score_roberta,bert_sent_neg,...,cleaned_text,cleaned_text_length,contains_question_mark,contains_exclamation,contains_ellipsis,num_exclamations,num_questions,num_ellipses,punctuation_count,embedding
0,30313,And so I know that this campaign has caused so...,No,"['campaign', 'caused', 'questioning', 'worries...",118,-0.4939,0.5,"[0.9687058329582214, 0.031294114887714386]","[0.11130036413669586, 0.014399373903870583, 0....",0.968706,...,And so I know that this campaign has caused so...,117,False,False,False,0,0,0,1,[[-2.65752003e-02 2.48833038e-02 -1.60991415e...
1,19099,"Now, let's balance the budget and protect Medi...",No,"['lets', 'balance', 'budget', 'protect', 'medi...",92,0.3818,0.0,"[0.5200970768928528, 0.4799029529094696]","[0.10983521491289139, 0.03143635019659996, 0.7...",0.520097,...,Now lets balance the budget and protect Medica...,87,False,False,False,0,0,0,5,[[-6.90975189e-02 -5.70267588e-02 -1.18759215e...


In [8]:
# print(df['embedding'].iloc[0])

# 12. Adding new feature embedding

In [4]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [125]:
# def get_embeddings(text):
#     inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True, padding="max_length")
#     output = model(**inputs)
#     embedding = output.last_hidden_state[:, 0, :]
#     return embedding.detach().numpy()

In [37]:
# df['embedding'] = df['Text'].apply(get_embeddings)

In [92]:
import ast
import re

In [115]:
df['embedding'] = df['embedding'].apply(lambda x: re.sub(r'\s+', ',', x))

In [103]:
import json

In [118]:
embeddings.dtype

dtype('<U12673')

In [44]:
df = pd.read_csv('add_embedding_features_9.csv')

In [10]:
# convert to numpy array
def convert_to_numpy_array(embd_str):
    #print(f"string: {embd_str}")
    #print(type(embd_str))
    embd_str_clean = embd_str.replace("\n", " ").replace(" ", " ")
    #print(type(embd_str_clean))
    #print(f"string cleansing: {embd_str_clean}")
    embd_str_clean = embd_str_clean.strip('[]')
    #print(type(embd_str_clean))
    #print(f"removing the outer brackets {embd_str_clean}")
    embd_list = embd_str_clean.split()
    #print(f"Split the string into individual values {embd_list}")
    embed_array = np.array(embd_list, dtype = float)
    #print(f"list to array {embed_array}")
    return embed_array

# 13. converted the embedding to overcome the memory error

In [11]:
df['converted_embedding'] = df['embedding'].apply(convert_to_numpy_array)

In [13]:
np.shape(df['converted_embedding'].iloc[0])

(768,)

In [152]:
df[['names', 'organizations', 'dates']]

Unnamed: 0,names,organizations,dates
0,0,0,0
1,0,2,0
2,0,0,0
3,0,1,0
4,0,0,0
...,...,...,...
22496,0,0,0
22497,0,0,0
22498,0,0,1
22499,0,0,0


## arrenging the features for model feeding

## MEMORY ERROR

## 1 - optimized data types -- not worked

In [1]:
# df[['text_length', 'roberta_sent_neg', 'roberta_sent_mixed',
#                             'roberta_sent_pos', 'names', 'organizations', 'dates',
#                             'count_tokens', 'cleaned_text_length', 'punctuation_count']].dtypes

## 14. converted the data type into smaller memory occupied data type (int64 -> int32)

In [27]:
df[['text_length', 'roberta_sent_neg', 'roberta_sent_mixed',
                            'roberta_sent_pos', 'names', 'organizations', 'dates',
                            'count_tokens', 'cleaned_text_length', 'punctuation_count']]

df = df.astype({'text_length': 'int32', 'roberta_sent_neg': 'float32', 'roberta_sent_mixed': 'float32', 'roberta_sent_pos': 'float32',
               'names': 'int32', 'organizations': 'int32', 'dates': 'int32', 'count_tokens': 'int32', 'cleaned_text_length': 'int32', 'punctuation_count': 'int32'})

## 15. - PCA for reducing dimensionality of emebddings

In [34]:
from sklearn.decomposition import PCA

In [131]:
pca = PCA(n_components = 50)
reduced_embeddings = pca.fit_transform(df['converted_embedding'].to_list())

In [134]:
reduced_embeddings = np.array(reduced_embeddings, dtype = np.float32)

In [28]:
# embeddings = np.vstack(df['embedding']) # 2-D array - inputs as feature matrices

In [135]:
reduced_embeddings.shape

(22501, 50)

In [30]:
additional_features = df[['text_length', 'roberta_sent_neg', 'roberta_sent_mixed',
                            'roberta_sent_pos', 'names', 'organizations', 'dates',
                            'count_tokens', 'cleaned_text_length', 'punctuation_count']].to_numpy()

In [31]:
# df.dtypes

In [None]:
df[['text_length', 'roberta_sent_neg', 'roberta_sent_mixed',
                            'roberta_sent_pos', 'names', 'organizations', 'dates',
                            'count_tokens', 'cleaned_text_length', 'punctuation_count']].dtypes

In [136]:
X = np.hstack([reduced_embeddings, additional_features]) # stack horizontally to add additional features with corresponding embedding vectors
y = df['labels']

# 16. model training - RFC and LR

## RANDOMFORESTCLASSIFIER

In [137]:
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

clf = RandomForestClassifier()
clf.fit(X_train,y_train)

In [138]:
y_pred = clf.predict(X_test)

In [139]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.97      0.90      4229
           1       0.84      0.41      0.55      1397

    accuracy                           0.84      5626
   macro avg       0.84      0.69      0.73      5626
weighted avg       0.84      0.84      0.81      5626



## Logistic Regression

In [None]:
# max_iter

In [144]:
log_reg = LogisticRegression(max_iter=1000)  # Ensure enough iterations for convergence
log_reg.fit(X_train, y_train)

In [145]:
y_pred = log_reg.predict(X_test)
print("Logistic Regression Results:")
print(classification_report(y_test, y_pred))

Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.87      0.94      0.91      4229
           1       0.77      0.59      0.67      1397

    accuracy                           0.85      5626
   macro avg       0.82      0.76      0.79      5626
weighted avg       0.85      0.85      0.85      5626



# 17. saving the model into pckl file 

In [147]:
import joblib
import os

In [149]:
save_folder = 'models'
os.makedirs(save_folder, exist_ok = True)
model_path = os.path.join(save_folder, 'logistic_reggression.pkl')
joblib.dump(log_reg, model_path)
# model.dump()

['models\\logistic_reggression.pkl']

In [151]:
save_folder = 'models'
os.makedirs(save_folder, exist_ok = True)
model_path = os.path.join(save_folder, 'rfc.pkl')
joblib.dump(clf, model_path)

['models\\rfc.pkl']

In [47]:
para = [{'max_iter':[1, 10, 100, 500, 700, 800, 900, 1000]}]

log_reg = LogisticRegression(random_state = 42)

clf_max_iter = GridSearchCV(log_reg, param_grid=para, cv = 5, scoring='r2')

In [1]:
# clf_max_iter.fit(X_train, y_train)

In [51]:
y_pred = clf_max_iter.predict(X_test)
print("Logistic Regression Results:")
print(classification_report(y_test, y_pred))

Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      4229
           1       0.77      0.63      0.69      1397

    accuracy                           0.86      5626
   macro avg       0.83      0.78      0.80      5626
weighted avg       0.86      0.86      0.86      5626



In [52]:
print(dir(clf_max_iter))

['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__sklearn_clone__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_build_request_for_signature', '_check_feature_names', '_check_n_features', '_check_refit_for_multimetric', '_doc_link_module', '_doc_link_template', '_doc_link_url_param_generator', '_estimator_type', '_format_results', '_get_default_requests', '_get_doc_link', '_get_metadata_request', '_get_param_names', '_get_routed_params_for_fit', '_get_scorers', '_get_tags', '_more_tags', '_parameter_constraints', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_run_search', '_select_best_index', '_sk_visual_block_', '

In [53]:
print("Best param:",clf_max_iter.best_params_['max_iter'])
print("Best param:",clf_max_iter.best_score_)

Best param: 500
Best param: 0.24908408566213747


In [76]:
df = pd.read_csv('embedding_8.csv')

In [78]:
df.head(10)

Unnamed: 0,Sentence_id,Text,class_label,Tokens,text_length,sentiment_score_veda,sentiment_score_textblob,sentiment_score_bert,sentiment_score_roberta,bert_sent_neg,...,count_tokens,cleaned_text,cleaned_text_length,contains_question_mark,contains_exclamation,contains_ellipsis,num_exclamations,num_questions,num_ellipses,punctuation_count
0,30313,And so I know that this campaign has caused so...,No,"['campaign', 'caused', 'questioning', 'worries...",118,-0.4939,0.5,"[0.9687058329582214, 0.031294114887714386]","[0.11130036413669586, 0.014399373903870583, 0....",0.968706,...,8,And so I know that this campaign has caused so...,117,False,False,False,0,0,0,1
1,19099,"Now, let's balance the budget and protect Medi...",No,"['lets', 'balance', 'budget', 'protect', 'medi...",92,0.3818,0.0,"[0.5200970768928528, 0.4799029529094696]","[0.10983521491289139, 0.03143635019659996, 0.7...",0.520097,...,8,Now lets balance the budget and protect Medica...,87,False,False,False,0,0,0,5
2,33964,I'd like to mention one thing.,No,"['id', 'mention', 'thing']",30,0.3612,0.0,"[0.22529488801956177, 0.774705171585083]","[0.04292938485741615, 0.5959416627883911, 0.27...",0.225295,...,3,Id like to mention one thing,28,False,False,False,0,0,0,2
3,16871,I must remind him the Democrats have controlle...,Yes,"['must', 'remind', 'democrats', 'controlled', ...",124,0.0,0.0,"[0.9874435067176819, 0.012556521221995354]","[0.31787270307540894, 0.20612367987632751, 0.3...",0.987444,...,9,I must remind him the Democrats have controlle...,122,False,False,False,0,0,0,2
4,13150,And to take a chance uh - now be - and not mak...,No,"['take', 'chance', 'effort', 'provide', 'contr...",161,0.2023,0.8,"[0.999810516834259, 0.00018953572725877166]","[0.5015895366668701, 0.01270473375916481, 0.27...",0.999811,...,8,And to take a chance uh now be and not make ev...,155,False,False,False,0,0,0,4
5,13386,"Well, what he is saying there in effect, we're...",No,"['saying', 'effect', 'inflation']",71,0.2732,0.0,"[0.9945316314697266, 0.005468361545354128]","[0.5366693735122681, 0.0423717275261879, 0.159...",0.994532,...,3,Well what he is saying there in effect were go...,67,False,False,False,0,0,0,4
6,28916,I'm proud of the fact that violent crime is do...,Yes,"['im', 'proud', 'fact', 'violent', 'crime', 's...",71,-0.6486,-0.051852,"[0.030331920832395554, 0.9696680307388306]","[0.0429118387401104, 0.24741673469543457, 0.68...",0.030332,...,7,Im proud of the fact that violent crime is dow...,69,False,False,False,0,0,0,2
7,10612,"You know, you may have seen your health care p...",No,"['may', 'seen', 'health', 'care', 'premiums']",60,0.4939,0.0,"[0.9901418089866638, 0.009858217090368271]","[0.21727149188518524, 0.045533761382102966, 0....",0.990142,...,5,You know you may have seen your health care pr...,58,False,False,False,0,0,0,2
8,22058,"If we're $4 trillion down, we should have ever...",Yes,"['4', 'trillion', 'everything', 'perfect', 'do...",75,0.3291,0.422222,"[0.03299140930175781, 0.9670085906982422]","[0.08489025384187698, 0.012218084186315536, 0....",0.032991,...,5,If were 4 trillion down we should have everyth...,69,False,False,False,0,0,0,6
9,18005,And I made some tough decisions.,No,"['tough', 'decisions']",32,-0.128,-0.388889,"[0.004486490972340107, 0.9955134987831116]","[0.047905098646879196, 0.013652173802256584, 0...",0.004486,...,2,And I made some tough decisions,31,False,False,False,0,0,0,1
