In [2]:
#from lemmatizationText import computeLemmatizationText
from sklearn.utils import shuffle

import pandas as pd
import numpy as np
#-------------------------- processing ------------------------------
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# hyper-parameters tuning
from sklearn.model_selection import GridSearchCV

# classifiers
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


# ensemble 
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
# label coding: numeric code assigned to each sentiment tag
tag_codes = {
    "positive" : 1,
    "negative" : 0,
    "neutral" : -1
}

column_name = "Tweet Text"

# Load the labelled training tweets, drop rows lacking either the text or the
# tag, then add a "tag_code" column holding the numeric code of each tag.
dataset = (
    pd.read_csv("./learningDataset-cleaned")
    .dropna(subset=[column_name, "tag"])
    .assign(tag_code=lambda frame: frame["tag"])
    .replace({"tag_code": tag_codes})
)

#y :labels set
labels = dataset["tag_code"]

#X :dataset without labels (raw tweet text)
list_text = dataset[column_name]

# The lemmatization helper is currently disabled (its import is commented out).
# Leaving `lemmatized_dataset` as an empty list makes every later
# `shuffle(lemmatized_dataset, labels)` fail because the two inputs have
# different lengths, so fall back to the raw text until lemmatization is back.
#lemmatized_dataset = computeLemmatizationText(dataset,column_name)
lemmatized_dataset = list_text

In [None]:
# *********************** SVM COUNT
# Token counts -> chi2 feature selection -> SVC, tuned with a 3-fold grid search.

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

svm_count_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('fselect', SelectKBest(chi2)),
    ('clf', svm.SVC()),
])

svm_count_params = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'fselect__k': [1000, 2000, 3000, 3500, 3700, "all"],
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
    # 0.01 used to appear twice here, which only repeated identical fits;
    # 0.1 completes the decade sweep.
    'clf__gamma': [1, 0.1, 0.01, 0.001],
}

# Several metrics are recorded, but the final model is refit on accuracy.
grid = GridSearchCV(svm_count_pipe, svm_count_params, scoring=scoring,
                    cv=3, refit="accuracy", n_jobs=-1)

X, y = shuffle(lemmatized_dataset, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

# best_score (accuracy) 0.7585910652920962
# best_params: {'clf__C': 100, 'clf__gamma': 0.001, 'fselect__k': 3000, 'vect__max_df': 0.65, 'vect__ngram_range': (1, 2)}


{'clf__C': 100, 'clf__gamma': 0.001, 'fselect__k': 3000, 'vect__max_df': 0.65, 'vect__ngram_range': (1, 2)}


In [5]:
# *********************** SVM TFIDF
# Token counts -> TF-IDF weighting -> chi2 feature selection -> SVC, 10-fold grid search.

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

svm_tfidf_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('fselect', SelectKBest(chi2)),
    ('clf', svm.SVC()),
])

# Full grid that was explored earlier (kept for reference):
#   'vect__ngram_range': ((1,1),(1,2)),
#   'clf__C':[0.001,0.01,0.1,1,10,100],
#   'clf__gamma':[1,0.01,0.01,0.001],
#   'vect__max_df':(0.65,0.75,0.85,1.0),
#   'fselect__k':[1000,2000,3000,3500,3700,"all"]
# The grid below is narrowed to single values except for the n-gram range.
svm_tfidf_params = {
    'clf__C': [100],
    'clf__gamma': [1],
    'fselect__k': [3000],
    'vect__max_df': [0.65],
    'vect__ngram_range': ((1, 1), (1, 2)),
}

grid = GridSearchCV(svm_tfidf_pipe, svm_tfidf_params, scoring=scoring,
                    cv=10, refit="accuracy", n_jobs=-1)

# This cell trains on the raw tweet text rather than the lemmatized text.
X, y = shuffle(list_text, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
#{'clf__C': 100, 'clf__gamma': 1, 'fselect__k': 3000, 'vect__max_df': 0.65, 'vect__ngram_range': (1, 1)}
#0.791237113402062    time:38min 42s

{'clf__C': 100, 'clf__gamma': 1, 'fselect__k': 3000, 'vect__max_df': 0.65, 'vect__ngram_range': (1, 1)}


In [7]:
from sklearn.metrics import classification_report

# Numeric code assigned to each sentiment tag.
tag_codes = {
    "positive": 1,
    "neutral": -1,
    "negative": 0,
}

# Read the held-out tweets, drop rows lacking text or tag, and append a
# "tag_code" column holding the numeric code of each tag.
test_dataset = (
    pd.read_csv("./testingDataset-cleaned-super.csv")
    .dropna(subset=["Tweet Text", "tag"])
    .assign(tag_code=lambda frame: frame["tag"])
    .replace({"tag_code": tag_codes})
)

# Write the coded test frame to disk.
test_dataset.to_csv("file.csv")

X_test = test_dataset["Tweet Text"]
y_test = test_dataset["tag_code"]

# `grid` is whichever search object was fitted last in this kernel.
y_pred = grid.predict(X_test)

# Report rows in the order positive (1), negative (0), neutral (-1).
report_label_order = [1, 0, -1]
print(classification_report(y_test, y_pred, labels=report_label_order))

              precision    recall  f1-score   support

           1       0.85      0.60      0.70       183
           0       0.68      0.57      0.62       150
          -1       0.55      0.81      0.66       160

    accuracy                           0.66       493
   macro avg       0.69      0.66      0.66       493
weighted avg       0.70      0.66      0.66       493



In [None]:
from sklearn.metrics import confusion_matrix

# Rows are true classes and columns predicted classes, both ordered
# negative (0), neutral (-1), positive (1).
matrix_label_order = [0, -1, 1]
print(confusion_matrix(y_test, y_pred, labels=matrix_label_order))

[[ 66  24  10]
 [ 23  84   7]
 [ 17  24 104]]


In [None]:
from sklearn.metrics import classification_report

# Sanity check: score the fitted search on the data it was trained on.
# Dedicated `train_*` names are used so the held-out `test_dataset`, `X_test`,
# `y_test` and `y_pred` are not silently replaced by training data, and the
# frame is no longer dumped to "file.csv" (that would overwrite the test dump).

# Numeric code assigned to each sentiment tag.
tag_codes = {
    "positive" : 1,
    "negative" : 0,
    "neutral" : -1
}

# Reload the training tweets, drop rows lacking text or tag, and append the
# numeric "tag_code" column.
train_eval_dataset = (
    pd.read_csv("./learningDataset-cleaned")
    .dropna(subset=["Tweet Text", "tag"])
    .assign(tag_code=lambda frame: frame["tag"])
    .replace({"tag_code": tag_codes})
)

X_train_eval = train_eval_dataset["Tweet Text"]
y_train_eval = train_eval_dataset["tag_code"]

# `grid` is whichever search object was fitted last in this kernel.
y_train_pred = grid.predict(X_train_eval)

# Report rows in the order negative (0), neutral (-1), positive (1).
print(classification_report(y_train_eval, y_train_pred, labels=[0, -1, 1]))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       794
          -1       0.96      0.99      0.98       713
           1       0.99      0.98      0.98       821

    accuracy                           0.98      2328
   macro avg       0.98      0.98      0.98      2328
weighted avg       0.98      0.98      0.98      2328



In [None]:

# Best cross-validated score (for the refit metric) of the last fitted search.
best_cv_score = grid.best_score_
print(best_cv_score)

0.791237113402062


In [None]:
# *********************** MNB TFIDF
# Token counts -> TF-IDF weighting -> chi2 feature selection -> multinomial naive Bayes.

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

nb_tfidf_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('fselect', SelectKBest(chi2)),
    ('clf', MultinomialNB()),
])

nb_tfidf_params = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'fselect__k': [1000, 2000, 3000, 3500, 3700, "all"],
}

# Several metrics are recorded, but the final model is refit on accuracy.
grid = GridSearchCV(nb_tfidf_pipe, nb_tfidf_params, scoring=scoring,
                    cv=3, refit="accuracy", n_jobs=-1)

X, y = shuffle(lemmatized_dataset, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

{'fselect__k': 2000, 'vect__max_df': 0.75, 'vect__ngram_range': (1, 1)}
0.711340206185567


In [None]:
# *********************** MNB COUNT
# Token counts -> chi2 feature selection -> multinomial naive Bayes.

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

nb_count_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('fselect', SelectKBest(chi2)),
    ('clf', MultinomialNB()),
])

nb_count_params = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'fselect__k': [1000, 2000, 3000, 3500, 3700, "all"],
}

# Several metrics are recorded, but the final model is refit on accuracy.
grid = GridSearchCV(nb_count_pipe, nb_count_params, scoring=scoring,
                    cv=3, refit="accuracy", n_jobs=-1)

X, y = shuffle(lemmatized_dataset, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

#{'fselect__k': 'all', 'vect__max_df': 0.75, 'vect__ngram_range': (1, 1)}
#0.738831615120275

{'fselect__k': 'all', 'vect__max_df': 0.75, 'vect__ngram_range': (1, 1)}
0.738831615120275


In [None]:
# *********************** LOGISTIC REGRESSION COUNT
# Token counts -> chi2 feature selection -> logistic regression.

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

lr_count_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('fselect', SelectKBest(chi2)),
    ('clf', LogisticRegression()),
])

lr_count_params = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'fselect__k': [1000, 2000, 3000, 3500, 3700, "all"],
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    # Single value: raises the solver's iteration cap for every candidate.
    'clf__max_iter': [1500],
}

# Several metrics are recorded, but the final model is refit on accuracy.
grid = GridSearchCV(lr_count_pipe, lr_count_params, scoring=scoring,
                    cv=3, refit="accuracy", n_jobs=-1)

X, y = shuffle(lemmatized_dataset, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)
#{'clf__C': 1, 'clf__max_iter': 1500, 'fselect__k': 3000, 'vect__max_df': 0.65, 'vect__ngram_range': (1, 1)}
#0.7762027491408935

In [None]:
# *********************** LOGISTIC REGRESSION TFIDF
# Token counts -> TF-IDF weighting -> chi2 feature selection -> logistic regression.

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

lr_tfidf_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('fselect', SelectKBest(chi2)),
    ('clf', LogisticRegression()),
])

lr_tfidf_params = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'fselect__k': [1000, 2000, 3000, 3500, 3700, "all"],
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    # Single value: raises the solver's iteration cap for every candidate.
    'clf__max_iter': [1500],
}

# Several metrics are recorded, but the final model is refit on accuracy.
grid = GridSearchCV(lr_tfidf_pipe, lr_tfidf_params, scoring=scoring,
                    cv=3, refit="accuracy", n_jobs=-1)

X, y = shuffle(lemmatized_dataset, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

#{'clf__C': 10, 'clf__max_iter': 1500, 'fselect__k': 1000, 'vect__max_df': 0.75, 'vect__ngram_range': (1, 1)}
#0.7925257731958762


{'clf__C': 10, 'clf__max_iter': 1500, 'fselect__k': 1000, 'vect__max_df': 0.75, 'vect__ngram_range': (1, 1)}
0.7925257731958762


In [None]:
# *********************** CNB TFIDF
# Token counts -> TF-IDF weighting -> chi2 feature selection -> complement naive Bayes.

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

cnb_tfidf_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('fselect', SelectKBest(chi2)),
    ('clf', ComplementNB()),
])

cnb_tfidf_params = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'fselect__k': [1000, 2000, 3000, 3500, 3700, "all"],
}

# Several metrics are recorded, but the final model is refit on accuracy.
grid = GridSearchCV(cnb_tfidf_pipe, cnb_tfidf_params, scoring=scoring,
                    cv=3, refit="accuracy", n_jobs=-1)

X, y = shuffle(lemmatized_dataset, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)
# Recorded result (the score line had lost its leading '#' and was being
# evaluated as a stray expression):
#{'fselect__k': 2000, 'vect__max_df': 0.75, 'vect__ngram_range': (1, 1)}
#0.7474226804123711

{'fselect__k': 2000, 'vect__max_df': 0.75, 'vect__ngram_range': (1, 1)}
0.7474226804123711


In [None]:
# *********************** CNB COUNT
# Token counts -> chi2 feature selection -> complement naive Bayes.
# (The `scoring` assignment used to start with a stray space, which made the
# whole cell fail with an IndentationError.)

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

cnb_count_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('fselect', SelectKBest(chi2)),
    ('clf', ComplementNB()),
])

cnb_count_params = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'fselect__k': [1000, 2000, 3000, 3500, 3700, "all"],
}

# Several metrics are recorded, but the final model is refit on accuracy.
grid = GridSearchCV(cnb_count_pipe, cnb_count_params, scoring=scoring,
                    cv=3, refit="accuracy", n_jobs=-1)

X, y = shuffle(lemmatized_dataset, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

In [None]:
# *********************** DECISION TREE COUNT
# Token counts -> chi2 feature selection -> decision tree.
# (The `scoring` assignment used to be indented, which made the whole cell
# fail with an IndentationError.)

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

dt_count_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('fselect', SelectKBest(chi2)),
    ('clf', DecisionTreeClassifier()),
])

# 'vect__ngram_range' was listed twice in this dict; the duplicate key is removed.
dt_count_params = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'fselect__k': [1000, 2000, 3000, 3500, 3700, "all"],
    'clf__criterion': ['gini', 'entropy'],
    # Further tree parameters that are currently not searched:
    #'clf__max_depth': [2,4,6,8,10,12],
    #'clf__min_samples_split': range(2,10),
    #'clf__min_samples_leaf': range(1,5),
}

# Several metrics are recorded, but the final model is refit on accuracy.
grid = GridSearchCV(dt_count_pipe, dt_count_params, scoring=scoring,
                    cv=3, refit="accuracy", n_jobs=-1)

X, y = shuffle(lemmatized_dataset, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

In [None]:
# *********************** DECISION TREE TFIDF
# Token counts -> TF-IDF weighting -> chi2 feature selection -> decision tree.
# (The `scoring` assignment used to start with a stray space, which made the
# whole cell fail with an IndentationError.)

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

dt_tfidf_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('fselect', SelectKBest(chi2)),
    ('clf', DecisionTreeClassifier()),
])

# 'vect__ngram_range' was listed twice in this dict; the duplicate key is removed.
dt_tfidf_params = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'fselect__k': [1000, 2000, 3000, 3500, 3700, "all"],
    'clf__criterion': ['gini', 'entropy'],
    # Further tree parameters that are currently not searched:
    #'clf__max_depth': [2,4,6,8,10,12],
    #'clf__min_samples_split': range(2,10),
    #'clf__min_samples_leaf': range(1,5),
}

# Several metrics are recorded, but the final model is refit on accuracy.
grid = GridSearchCV(dt_tfidf_pipe, dt_tfidf_params, scoring=scoring,
                    cv=3, refit="accuracy", n_jobs=-1)

X, y = shuffle(lemmatized_dataset, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

In [None]:
# *********************** BAGGING TFIDF
# Token counts -> TF-IDF weighting -> chi2 feature selection -> bagging
# ensemble with its default base estimator.
# (The `scoring` assignment used to start with a stray space, which made the
# whole cell fail with an IndentationError.)

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

bag_tfidf_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('fselect', SelectKBest(chi2)),
    ('clf', BaggingClassifier()),
])

bag_tfidf_params = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'fselect__k': [1000, 2000, 3000, 3500, 3700, "all"],
    'clf__n_estimators': [10, 30, 50],
}

# Several metrics are recorded, but the final model is refit on accuracy.
grid = GridSearchCV(bag_tfidf_pipe, bag_tfidf_params, scoring=scoring,
                    cv=3, refit="accuracy", n_jobs=-1)

X, y = shuffle(lemmatized_dataset, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

In [None]:
# *********************** BAGGING COUNT
# Token counts -> chi2 feature selection -> bagging ensemble with its default
# base estimator.
# (The `scoring` assignment used to start with a stray space, which made the
# whole cell fail with an IndentationError.)

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

bag_count_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('fselect', SelectKBest(chi2)),
    ('clf', BaggingClassifier()),
])

bag_count_params = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'fselect__k': [1000, 2000, 3000, 3500, 3700, "all"],
    'clf__n_estimators': [10, 30, 50],
}

# Several metrics are recorded, but the final model is refit on accuracy.
grid = GridSearchCV(bag_count_pipe, bag_count_params, scoring=scoring,
                    cv=3, refit="accuracy", n_jobs=-1)

X, y = shuffle(lemmatized_dataset, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

In [None]:
# *********************** BAGGING TFIDF SVM
# Token counts -> TF-IDF weighting -> chi2 feature selection -> bagging
# ensemble of SVC models.

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

bag_svm_tfidf_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('fselect', SelectKBest(chi2)),
    # The wrapped estimator is passed positionally: newer scikit-learn renamed
    # the `base_estimator` keyword to `estimator`, and the first positional
    # argument works under either name.
    ('clf', BaggingClassifier(svm.SVC())),
])

bag_svm_tfidf_params = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'fselect__k': [1000, 2000, 3000, 3500, 3700, "all"],
    'clf__n_estimators': [10, 30, 50],
}

# Several metrics are recorded, but the final model is refit on accuracy.
grid = GridSearchCV(bag_svm_tfidf_pipe, bag_svm_tfidf_params, scoring=scoring,
                    cv=3, refit="accuracy", n_jobs=-1)

X, y = shuffle(lemmatized_dataset, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

In [None]:
# *********************** BAGGING TFIDF LR
# Token counts -> TF-IDF weighting -> chi2 feature selection -> bagging
# ensemble of logistic-regression models.
# (The `scoring` assignment used to start with a stray space, which made the
# whole cell fail with an IndentationError.)

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

bag_lr_tfidf_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('fselect', SelectKBest(chi2)),
    # The wrapped estimator is passed positionally: newer scikit-learn renamed
    # the `base_estimator` keyword to `estimator`, and the first positional
    # argument works under either name.
    ('clf', BaggingClassifier(LogisticRegression())),
])

bag_lr_tfidf_params = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'fselect__k': [1000, 2000, 3000, 3500, 3700, "all"],
    'clf__n_estimators': [10, 30, 50],
}

# Several metrics are recorded, but the final model is refit on accuracy.
grid = GridSearchCV(bag_lr_tfidf_pipe, bag_lr_tfidf_params, scoring=scoring,
                    cv=3, refit="accuracy", n_jobs=-1)

X, y = shuffle(lemmatized_dataset, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)

In [None]:
# *********************** RANDOM FOREST
# Token counts -> TF-IDF weighting -> chi2 feature selection -> random forest.

# The metric functions expect the *set* of class labels in `labels`; passing the
# whole target column repeats each class once per sample and skews the
# micro-averaged scores.
class_labels = labels.unique()
micro_kwargs = {'average': 'micro', 'labels': class_labels, 'zero_division': 1}
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, **micro_kwargs),
    'recall': make_scorer(recall_score, **micro_kwargs),
    'f1_score': make_scorer(f1_score, **micro_kwargs),
}

ran_tfidf_pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('fselect', SelectKBest(chi2)),
    ('clf', RandomForestClassifier()),
])

ran_tfidf_params = {
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__max_df': (0.65, 0.75, 0.85, 1.0),
    'fselect__k': [1000, 2000, 3000, 3500, 3700, "all"],
    'clf__criterion': ["gini", "entropy"],
    # Further forest parameters that are currently not searched:
    # 'clf__n_estimators': [100, 300, 500, 800],
    # 'clf__min_samples_split': [2, 5, 10],
}

# Search over the random-forest pipeline defined above. The call used to pass
# the bagging/logistic-regression pipeline and grid, so the forest was never tuned.
grid = GridSearchCV(ran_tfidf_pipe, ran_tfidf_params, scoring=scoring,
                    cv=3, refit="accuracy", n_jobs=-1)

X, y = shuffle(lemmatized_dataset, labels, random_state=123)

result = grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)