In [32]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import nltk 
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC 
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import json

In [27]:
train_df = pd.read_csv("train.csv")
train_df.head(2)

Unnamed: 0,tweet,Airtel_Pos,Airtel_Neut,Airtel_Neg,Saf_Pos,Saf_Neut,Saf_Neg,clean_tweets
0,@emukala85 Apologies for the inconveniences ca...,0,1,0,0,0,1,apologies inconveniences caused notified recha...
1,Communications Authority: Airtel Kenya had the...,0,0,1,0,1,0,communications authority airtel kenya worst mo...


In [9]:
len(train_df)

3274

In [10]:
cv, tfidf = CountVectorizer(), TfidfVectorizer()
cv.fit(train_df['clean_tweets']), tfidf.fit(train_df['clean_tweets'])

(CountVectorizer(), TfidfVectorizer())

In [11]:
X = train_df['clean_tweets']
y = train_df.drop(columns=["tweet", "clean_tweets"], axis=1) 

In [12]:
X_cv, X_tfidf = cv.transform(X), tfidf.transform(X)

In [14]:
X_train_cv, X_test_cv, y_train, y_test = train_test_split(X_cv, y, test_size=0.3, random_state=2022)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=2022)

In [22]:
models = [LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(), SVC(), MLPClassifier()]
ovr_models = list()
for model in models:
    ovr_model = OneVsRestClassifier(model)
    ovr_models.append(ovr_model)
ovr_models

[OneVsRestClassifier(estimator=LogisticRegression()),
 OneVsRestClassifier(estimator=DecisionTreeClassifier()),
 OneVsRestClassifier(estimator=RandomForestClassifier()),
 OneVsRestClassifier(estimator=SVC()),
 OneVsRestClassifier(estimator=MLPClassifier())]

In [24]:
#Fitting on the two sets of vectors
model_accs_cv = []
model_f1s_cv = []
model_names = ["logistic regression", "decision tree", "random forest", "support vector machine", "perceptron"]
for model in ovr_models:
    model.fit(X_train_cv,y_train)
    predictions = model.predict(X_test_cv)
    f1_score_cv = f1_score(predictions, y_test, average="weighted")
    acc_score_cv = accuracy_score(predictions, y_test)
    model_accs_cv.append(acc_score_cv), model_f1s_cv.append(f1_score_cv)


# Results
{model: [model accuracy, model f1_score]}

In [28]:
print("Stand Alone Model Results: Count-vectorizer")
cv_experiment_results = dict((z[0], list(z[1:])) for z in zip(model_names, model_accs_cv, model_f1s_cv)) 
print(cv_experiment_results)
with open("count_vectorizer_ml_experiment_results_com.json", "w") as outfile:
    json.dump(cv_experiment_results, outfile)


Stand Alone Model Results: Count-vectorizer
{'logistic regression': [0.2990844354018311, 0.6715337409742348], 'decision tree': [0.1770091556459817, 0.6149613965988247], 'random forest': [0.19328585961342828, 0.6373546468890064], 'support vector machine': [0.07324516785350967, 0.587587155516257], 'perceptron': [0.3896236012207528, 0.7209713366149341]}


In [29]:
#Fitting on the two sets of vectors
model_accs_tfidf = []
model_f1s_tfidf = []
model_names = ["logistic regression", "decision tree", "random forest", "support vector machine", "perceptron"]
for model in ovr_models:
    model.fit(X_train_tfidf,y_train)
    predictions = model.predict(X_test_tfidf)
    f1_score_tfidf = f1_score(predictions, y_test, average="weighted")
    acc_score_tfidf = accuracy_score(predictions, y_test)
    model_accs_tfidf.append(acc_score_tfidf), model_f1s_tfidf.append(f1_score_tfidf)


In [31]:
print("Stand Alone Model Results: TFIDF")
tfidf_experiment_results = dict((z[0], list(z[1:])) for z in zip(model_names, model_accs_tfidf, model_f1s_tfidf)) 
print(tfidf_experiment_results)
with open("TFIDF_ml_experiment_results_com.json", "w") as outfile:
    json.dump(tfidf_experiment_results, outfile)


Stand Alone Model Results: TFIDF
{'logistic regression': [0.07121057985757884, 0.5080021857786827], 'decision tree': [0.15666327568667346, 0.5917940883436114], 'random forest': [0.14954221770091555, 0.615736849166816], 'support vector machine': [0.07324516785350967, 0.5857961421958123], 'perceptron': [0.35300101729399797, 0.7077509465490691]}


In [33]:
#hybrid model
base_estimators = [
    ("log reg", OneVsRestClassifier(LogisticRegression())), 
    ("decision tree", OneVsRestClassifier(DecisionTreeClassifier())), 
    ("random forest", OneVsRestClassifier(RandomForestClassifier())), 
    ("MLP", OneVsRestClassifier(MLPClassifier()))
]
final_estimator = OneVsRestClassifier(StackingClassifier(estimators=base_estimators, final_estimator=OneVsRestClassifier(MLPClassifier())))


In [34]:
#FITTING AND TESTNIG ON CV VECTORS
final_estimator.fit(X_train_cv, y_train)
cv_hybrid_preds = final_estimator.predict(X_test_cv)
cv_hybrid_acc = accuracy_score(y_test, cv_hybrid_preds)
cv_hybrid_f1 = f1_score(y_test, cv_hybrid_preds, average="weighted")
cv_results = {"Stacked cv results": [cv_hybrid_acc, cv_hybrid_f1]}
print(cv_results)
#with open("stacking_cv_results.json", "wb") as outfile:
#    json.dump(cv_results, outfile)


{'Stacked cv results': [0.3957273652085453, 0.7158894610993464]}


TypeError: a bytes-like object is required, not 'str'

In [35]:
#FITTING AND TESTNIG ON CV VECTORS
final_estimator.fit(X_train_tfidf, y_train)
tfidf_hybrid_preds = final_estimator.predict(X_test_tfidf)
tfidf_hybrid_acc = accuracy_score(y_test, tfidf_hybrid_preds)
tfidf_hybrid_f1 = f1_score(y_test, tfidf_hybrid_preds, average="weighted")
tfidf_results = {"Stacked TFIDF results": [tfidf_hybrid_acc, tfidf_hybrid_f1]}
print(tfidf_results)
#with open("stacking_tfidf_results.json", "wb") as outfile:
#    json.dump(tfidf_results, outfile)


{'Stacked TFIDF results': [0.35198372329603256, 0.6986402163730435]}


# Using CV and Conditional Probabilities 

In [41]:
train_data, test_data = train_test_split(train_df, test_size=0.3, random_state=42)
len(test_data)

983

In [55]:
base_est = [
    ("svm", SVC()), 
    ("lr", LogisticRegression())
]
final_est = StackingClassifier(estimators=base_est, final_estimator=SVC())

In [56]:
def pr(y_i, y):
    p = X_train_cv[y == y_i].sum(0)
    return (p+1) / ((y == y_i).sum()+1)

def get_mdl(y):
    y = y.values
    r = np.log(pr(1, y) / pr(0, y))
    m = final_est
    x_nb = X_train_cv.multiply(r)
    return m.fit(x_nb, y), r

preds = np.zeros((len(test_data), 6))
sample_outputs = list()
for i, j in enumerate(y.columns):
    #print('fit', j)
    m, r = get_mdl(y_train[j])
    #com_model = joblib.dump(m, "sample_nb_log_reg.sav")
    preds[:, i] = m.predict(X_test_cv.multiply(r))#[:, 1]
    sample_text = "While Safaricom's bundles can be considered to be fast and long lasting, Airtel's bundles are often cheap."
    enc_text = tfidf.transform([sample_text])
    sample_output = m.predict(enc_text.multiply(r))#[:, 1]
    print(sample_output)
    sample_outputs.append(sample_output)
    # set_of_rs.append(r)



[0]
[0]
[0]
[0]
[0]
[0]


In [58]:
#Model performance:
print("Logistic Regression + SVC plus nb weights using cv results")
print("Accuracy:" +str(accuracy_score(preds, y_test)))
print("F1 Score: " + str(f1_score(preds, y_test, average="weighted")))

Logistic Regression + SVC plus nb weights using cv results
Accuracy:0.2624618514750763
F1 Score: 0.6920179495588085
