In [18]:
from lemmatizationText import computeLemmatizationText
from sklearn.utils import shuffle

import pandas as pd
import numpy as np
#-------------------------- processing ------------------------------
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# hyper-parameters tuning
from sklearn.model_selection import GridSearchCV

# classifiers
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


# ensemble 
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

In [19]:
dataset = pd.read_csv("./apple_tweets_for_modelling")
dataset.dropna(subset=["Tweet Text","tag"], inplace=True)
dataset.head()

# label coding 
tag_codes = {
    "positive" : 1, 
    "negative" : 0,
    "neutral" : -1
}

# category mapping
dataset["tag_code"] = dataset["tag"]
dataset = dataset.replace({"tag_code" :tag_codes})

#y :labels set    
labels = dataset["tag_code"]

#X :dataset without labels
lemmatized_dataset = []
column_name = "Tweet Text"
lemmatized_dataset = computeLemmatizationText(dataset,column_name)

print(lemmatized_dataset)



In [48]:
scoring = {'accuracy' : make_scorer(accuracy_score), 
        'precision' : make_scorer(precision_score,average='micro',labels=labels,zero_division=True),
        'recall' : make_scorer(recall_score,average='micro',labels=labels,zero_division=True), 
        'f1_score' : make_scorer(f1_score,average='micro',labels=labels,zero_division=True)}



svm_count_pipe = Pipeline([
    ('vect',CountVectorizer()),
    ('fselect',SelectKBest(chi2)),
    ('clf',svm.SVC())    
    ])

svm_count_params ={
            'vect__ngram_range': ((1,1),(1,2)),
            'clf__C':[0.001,0.01,10],
            'clf__gamma':[1,0.01],
            'vect__max_df':(0.65,0.75,1.0),
            'fselect__k':[1000,2000,"all"]                    
}

grid = GridSearchCV(svm_count_pipe, svm_count_params, scoring=scoring,cv=3,refit="accuracy",n_jobs=-1)

X,y = shuffle(lemmatized_dataset,labels,random_state=123)

result = grid.fit(X,y)

print(grid.best_params_)

{'clf__C': 10, 'clf__gamma': 0.01, 'fselect__k': 2000, 'vect__max_df': 0.65, 'vect__ngram_range': (1, 1)}


In [71]:
#print(grid.best_score_)

#print(grid.get_params())

svm_results ={
        "accuracy mean scores":[],
        "accuracy std scores":[],
        "precision mean scores":[],
        "precision std scores":[],
        "recall mean scores":[],
        "recall std scores":[],
        "f1_score mean scores":[],
        "f1_score std scores":[],
        "best params": []
}

#print(best_parameters)
print(grid.best_params_)
svm_results["best params"].append(grid.best_params_)
for scorer in scoring:
    best_index = np.nonzero(grid.cv_results_['rank_test_%s' % scorer] ==1)[0][0]
    best_mean_score = grid.cv_results_['mean_test_%s' % scorer][best_index]
    best_std_score = grid.cv_results_['std_test_%s' % scorer][best_index]
    svm_results["%s mean scores" % scorer].append(best_mean_score)
    svm_results["%s std scores" % scorer].append(best_std_score)
   
print(svm_results)

{'clf__C': 10, 'clf__gamma': 0.01, 'fselect__k': 2000, 'vect__max_df': 0.65, 'vect__ngram_range': (1, 1)}
{'accuracy mean scores': [0.7534364261168385], 'accuracy std scores': [0.011348663929229312], 'precision mean scores': [0.7578010793327882], 'precision std scores': [0.011233795957342774], 'recall mean scores': [0.752350301032768], 'recall std scores': [0.010942702159913106], 'f1_score mean scores': [0.7550645449854537], 'f1_score std scores': [0.011044535244853713], 'best params': [{'clf__C': 10, 'clf__gamma': 0.01, 'fselect__k': 2000, 'vect__max_df': 0.65, 'vect__ngram_range': (1, 1)}]}


In [44]:
print(grid.cv_results_)

{'mean_fit_time': array([1.57144348, 1.22249357, 0.78391012, 0.84822774, 0.69006515,
       0.81810951, 0.7277449 , 0.97560747, 0.74454276, 0.98530054,
       0.79654757, 0.93320998, 0.82373722, 1.32582474, 0.92027529,
       1.35125844, 0.82564529, 1.47490176, 0.79739722, 0.77704414,
       0.68918546, 0.86034671, 0.71680013, 0.87650371, 1.40224075,
       1.3917954 , 0.95807123, 1.19016767, 0.93849182, 1.07199725,
       1.1480643 , 1.76841466, 1.10435247, 2.53293729, 1.16727304,
       1.7113382 , 0.82213736, 0.88672702, 0.81648747, 1.02761555,
       0.88035369, 0.98225101, 0.99367372, 1.09198229, 1.09817807,
       1.24009482, 1.00251635, 1.19394151, 1.19137287, 1.87807624,
       1.14425715, 1.70637576, 1.13388753, 1.73124719, 0.89334671,
       0.95920897, 0.90221341, 1.07414293, 0.89051421, 1.10390655,
       0.91618085, 1.12808291, 0.81232436, 1.26329525, 0.95846764,
       1.1615212 , 1.08107861, 1.68574874, 1.19886629, 1.65253274,
       1.19794798, 1.76115171, 0.85433022, 1

In [67]:
#SPETTACOLO
ar = np.array([37, 1, 37, 37, 1, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
       37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
       37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
       37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
       37, 37, 37, 37, 19, 22, 19, 23, 19, 23, 28, 25, 29, 26, 29, 26, 33,
       34, 31, 35, 31, 35, 10,  9, 15, 13, 15, 13,  1,  2,  3, 17,  3, 17,
        3,  6,  6, 11,  6, 11])

best_index = np.nonzero(ar == 1)


print(best_index)
print(best_index[0])
print(best_index[0][0])

(array([ 1,  4, 96]),)
[ 1  4 96]
1


In [None]:
#logisticregrssion
scoring = {'accuracy' : make_scorer(accuracy_score), 
        'precision' : make_scorer(precision_score,average='micro',labels=labels,zero_division=True),
        'recall' : make_scorer(recall_score,average='micro',labels=labels,zero_division=True), 
        'f1_score' : make_scorer(f1_score,average='micro',labels=labels,zero_division=True)}



lr_pipe = Pipeline([
    ('vect',CountVectorizer()),
    ('fselect',SelectKBest(chi2)),
    ('clf',LogisticRegression())    
    ])

lr_params ={
            'vect__ngram_range': ((1,1),(1,2)),
            'clf__C':[0.001,0.01,10],
            'clf__gamma':[1,0.01],
            'vect__max_df':(0.65,0.75,1.0),
            'fselect__k':[1000,2000,"all"]                    
}

grid = GridSearchCV(lr_pipe, lr_params, scoring=scoring,cv=3,refit="accuracy",n_jobs=-1)

X,y = shuffle(lemmatized_dataset,labels,random_state=123)
result = grid.fit(X,y)

print(grid.best_params_)

#results

#params

#grid

#raccolta info


In [None]:
#MN-Bayes
scoring = {'accuracy' : make_scorer(accuracy_score), 
        'precision' : make_scorer(precision_score,average='micro',labels=labels,zero_division=True),
        'recall' : make_scorer(recall_score,average='micro',labels=labels,zero_division=True), 
        'f1_score' : make_scorer(f1_score,average='micro',labels=labels,zero_division=True)}



svm_pipe = Pipeline([
    ('vect',CountVectorizer()),
    ('fselect',SelectKBest(chi2)),
    ('clf',svm.SVC())    
    ])

svm_params ={
            'vect__ngram_range': ((1,1),(1,2)),
            'clf__C':[0.001,0.01,10],
            'clf__gamma':[1,0.01],
            'vect__max_df':(0.65,0.75,1.0),
            'fselect__k':[1000,2000,"all"]                    
}

grid = GridSearchCV(svm_pipe, svm_params, scoring=scoring,cv=3,refit="accuracy",n_jobs=-1)

X,y = shuffle(lemmatized_dataset,labels,random_state=123)
#X = np.array(lemmatized_dataset)
#y = np.array(labels)
result = grid.fit(X,y)

print(grid.best_params_)

#results

#params

#grid

#raccolta info


In [None]:
#CN-Bayes
scoring = {'accuracy' : make_scorer(accuracy_score), 
        'precision' : make_scorer(precision_score,average='micro',labels=labels,zero_division=True),
        'recall' : make_scorer(recall_score,average='micro',labels=labels,zero_division=True), 
        'f1_score' : make_scorer(f1_score,average='micro',labels=labels,zero_division=True)}



svm_pipe = Pipeline([
    ('vect',CountVectorizer()),
    ('fselect',SelectKBest(chi2)),
    ('clf',svm.SVC())    
    ])

svm_params ={
            'vect__ngram_range': ((1,1),(1,2)),
            'clf__C':[0.001,0.01,10],
            'clf__gamma':[1,0.01],
            'vect__max_df':(0.65,0.75,1.0),
            'fselect__k':[1000,2000,"all"]                    
}

grid = GridSearchCV(svm_pipe, svm_params, scoring=scoring,cv=3,refit="accuracy",n_jobs=-1)

X,y = shuffle(lemmatized_dataset,labels,random_state=123)
#X = np.array(lemmatized_dataset)
#y = np.array(labels)
result = grid.fit(X,y)

print(grid.best_params_)

#results

#params

#grid

#raccolta info


In [None]:
#Decision tree
scoring = {'accuracy' : make_scorer(accuracy_score), 
        'precision' : make_scorer(precision_score,average='micro',labels=labels,zero_division=True),
        'recall' : make_scorer(recall_score,average='micro',labels=labels,zero_division=True), 
        'f1_score' : make_scorer(f1_score,average='micro',labels=labels,zero_division=True)}



svm_pipe = Pipeline([
    ('vect',CountVectorizer()),
    ('fselect',SelectKBest(chi2)),
    ('clf',svm.SVC())    
    ])

svm_params ={
            'vect__ngram_range': ((1,1),(1,2)),
            'clf__C':[0.001,0.01,10],
            'clf__gamma':[1,0.01],
            'vect__max_df':(0.65,0.75,1.0),
            'fselect__k':[1000,2000,"all"]                    
}

grid = GridSearchCV(svm_pipe, svm_params, scoring=scoring,cv=3,refit="accuracy",n_jobs=-1)

X,y = shuffle(lemmatized_dataset,labels,random_state=123)
#X = np.array(lemmatized_dataset)
#y = np.array(labels)
result = grid.fit(X,y)

print(grid.best_params_)

#results

#params

#grid

#raccolta info