In [1]:
from sklearn.utils import shuffle

import pandas as pd
import numpy as np
#-------------------------- processing ------------------------------
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# hyper-parameters tuning
from sklearn.model_selection import GridSearchCV

# classifiers
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


# ensemble 
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the training tweets. Rows missing either the text or the tag are dropped
# up front because neither the vectorizer nor the label encoding can use them.
dataset = (
    pd.read_csv("./learningDataset-cleaned")
    .dropna(subset=["Tweet Text", "tag"])
)

# label coding
tag_codes = {
    "positive" : 1,
    "negative" : 0,
    "neutral" : -1
}

# category mapping
# Series.map + an explicit integer cast is used instead of DataFrame.replace:
# replace() relies on silently downcasting the object column to int (deprecated
# in pandas 2.2) and leaves unknown tags in place as strings.
tag_code = dataset["tag"].map(tag_codes)
unknown_tags = dataset.loc[tag_code.isna(), "tag"].unique()
if len(unknown_tags) > 0:
    raise ValueError(
        f"Unmapped tag values in training data: {sorted(map(str, unknown_tags))}"
    )
dataset = dataset.assign(tag_code=tag_code.astype("int64"))

#y :labels set
labels = dataset["tag_code"]

#X :dataset without labels
list_text = dataset['Tweet Text']

In [3]:
# *********************** DECISION TREE TFIDF
# Grid-search a CountVectorizer -> TF-IDF -> chi2 feature selection -> decision
# tree pipeline with 10-fold cross-validation; the best configuration (by
# accuracy) is refit on the full training data and kept in `grid`.
SEED = 123  # shared by the shuffle and the tree so a re-run repeats the same search

# `labels=` takes the distinct class values, not the per-sample label column.
class_codes = labels.unique().tolist()

# NOTE: with every class listed, micro-averaged precision/recall/F1 coincide with
# accuracy on single-label multiclass data; switch to average='macro' if
# per-class-balanced numbers are wanted.
scoring = {'accuracy' : make_scorer(accuracy_score),
        'precision' : make_scorer(precision_score,average='micro',labels=class_codes,zero_division=1),
        'recall' : make_scorer(recall_score,average='micro',labels=class_codes,zero_division=1),
        'f1_score' : make_scorer(f1_score,average='micro',labels=class_codes,zero_division=1)}


dt_tfidf_pipe = Pipeline([
    ('vect',CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('fselect',SelectKBest(chi2)),
    # Seeded so tie-breaking between equally good splits is repeatable.
    ('clf', DecisionTreeClassifier(random_state=SEED))
    ])

# Each key is '<pipeline step>__<parameter>'; every combination is evaluated.
dt_tfidf_params ={
            'vect__ngram_range': ((1,1),(1,2)),
            'vect__max_df':(0.65,0.75,0.85,1.0),
            'fselect__k':[1000,2000,3000,3500,3700,"all"],
            'clf__criterion': ['gini', 'entropy'],
            # Tree-size parameters below are currently excluded from the search.
            #'clf__max_depth': [2,4,6,8,10,12],
            #'clf__min_samples_split': range(2,10),
            #'clf__min_samples_leaf': range(1,5),
}

grid = GridSearchCV(dt_tfidf_pipe, dt_tfidf_params, scoring=scoring,cv=10,refit="accuracy",n_jobs=-1)

# Shuffle texts and labels together before the folds are cut.
X,y = shuffle(list_text,labels,random_state=SEED)

result = grid.fit(X,y)

print(grid.best_params_)
print(grid.best_score_)

{'clf__criterion': 'gini', 'fselect__k': 1000, 'vect__max_df': 0.75, 'vect__ngram_range': (1, 2)}
0.658925928666568


In [4]:
from sklearn.metrics import classification_report

# Load the held-out tweets; rows without text or tag cannot be scored.
test_dataset = (
    pd.read_csv("./testingDataset-cleaned-super.csv")
    .dropna(subset=["Tweet Text", "tag"])
)

# category mapping
# Reuse the training `tag_codes` so the test labels can never drift from the
# coding the model was fitted on. Series.map + an explicit integer cast avoids
# DataFrame.replace's deprecated silent downcasting and surfaces unknown tags.
test_tag_code = test_dataset["tag"].map(tag_codes)
unknown_test_tags = test_dataset.loc[test_tag_code.isna(), "tag"].unique()
if len(unknown_test_tags) > 0:
    raise ValueError(
        f"Unmapped tag values in test data: {sorted(map(str, unknown_test_tags))}"
    )
test_dataset = test_dataset.assign(tag_code=test_tag_code.astype("int64"))

# Snapshot of the encoded test set written to disk.
test_dataset.to_csv("file.csv")


X_test = test_dataset['Tweet Text']
y_test = test_dataset["tag_code"]

# GridSearchCV.predict delegates to the estimator refit with the best parameters.
y_pred = grid.predict(X_test)
# Report rows follow the order given in `labels`: positive, negative, neutral.
print(classification_report(y_test, y_pred,labels=[1,0,-1]))

              precision    recall  f1-score   support

           1       0.65      0.45      0.53       183
           0       0.51      0.50      0.50       150
          -1       0.49      0.67      0.57       160

    accuracy                           0.54       493
   macro avg       0.55      0.54      0.53       493
weighted avg       0.55      0.54      0.54       493



In [None]:
import joblib

# Persist the fitted grid-search object (including its refit best estimator)
# so it can be reloaded later without repeating the search.
joblib.dump(value=grid, filename='model_dt_tfidf.pkl')