In [1]:
from sklearn.utils import shuffle

import pandas as pd
import numpy as np
#-------------------------- processing ------------------------------
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# hyper-parameters tuning
from sklearn.model_selection import GridSearchCV

# classifiers
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


# ensemble 
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
dataset = pd.read_csv("./learningDataset-cleaned")
dataset.dropna(subset=["Tweet Text","tag"], inplace=True)
dataset.head()

# label coding 
tag_codes = {
    "positive" : 1, 
    "negative" : 0,
    "neutral" : -1
}

# category mapping
dataset["tag_code"] = dataset["tag"]
dataset = dataset.replace({"tag_code" :tag_codes})

#y :labels set    
labels = dataset["tag_code"]

#X :dataset without labels
list_text = dataset['Tweet Text']

In [3]:
# *********************** LOGISTIC REGRESSION COUNT
scoring = {'accuracy' : make_scorer(accuracy_score), 
        'precision' : make_scorer(precision_score,average='micro',labels=labels,zero_division=True),
        'recall' : make_scorer(recall_score,average='micro',labels=labels,zero_division=True), 
        'f1_score' : make_scorer(f1_score,average='micro',labels=labels,zero_division=True)}


lr_tfidf_pipe = Pipeline([
    ('vect',CountVectorizer()),
   # ('tfidf', TfidfTransformer()),
    ('fselect',SelectKBest(chi2)),
    ('clf', LogisticRegression()),     
    ])

lr_tfidf_params ={
            'vect__ngram_range': ((1,1),(1,2)),
            'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
            #'clf__max_iter': [1500],
            'vect__max_df':(0.65,0.75,0.85,1.0),
            'fselect__k':[1000,2000,3000,3500,3700,"all"]                    
}

grid = GridSearchCV(lr_tfidf_pipe, lr_tfidf_params, scoring=scoring,cv=10,refit="accuracy",n_jobs=-1)

X,y = shuffle(list_text,labels,random_state=123)

result = grid.fit(X,y)

print(grid.best_params_)
print(grid.best_score_)

{'clf__C': 1, 'fselect__k': 'all', 'vect__max_df': 0.65, 'vect__ngram_range': (1, 1)}
0.7899696610922007


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [4]:
from sklearn.metrics import classification_report

test_dataset = pd.read_csv("./testingDataset-cleaned-super.csv")
test_dataset.dropna(subset=["Tweet Text","tag"], inplace=True)


# label coding 
tag_codes = {
    "positive" : 1,     
    "neutral" : -1,
    "negative" : 0
}

# category mapping
test_dataset["tag_code"] = test_dataset["tag"]
test_dataset = test_dataset.replace({"tag_code" :tag_codes})

test_dataset.to_csv("file.csv")


X_test = test_dataset['Tweet Text']
y_test = test_dataset["tag_code"]

y_pred = grid.predict(X_test)
#target_names = [0, 1, -1]
print(classification_report(y_test, y_pred,labels=[1,0,-1]))

              precision    recall  f1-score   support

           1       0.80      0.58      0.67       183
           0       0.65      0.58      0.61       150
          -1       0.50      0.70      0.58       160

    accuracy                           0.62       493
   macro avg       0.65      0.62      0.62       493
weighted avg       0.65      0.62      0.62       493

