In [1]:
# Import all the required libraries 
import numpy as np
import pandas as pd
import re
import string


#import stopwords and text processing libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download("punkt")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mathan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mathan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mathan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [2]:
#import machine learning libraries

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [4]:
df.drop("selected_text", axis = 1, inplace = True)

In [5]:
df.head()

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   textID     27481 non-null  object
 1   text       27480 non-null  object
 2   sentiment  27481 non-null  object
dtypes: object(3)
memory usage: 644.2+ KB


In [7]:
df.isnull().sum()

textID       0
text         1
sentiment    0
dtype: int64

In [8]:
df.dropna(inplace = True)

In [9]:
df.isnull().sum()

textID       0
text         0
sentiment    0
dtype: int64

In [11]:
df.describe()

Unnamed: 0,textID,text,sentiment
count,27480,27480,27480
unique,27480,27480,3
top,c0704fd61e,_express i didnt get to see it tonight,neutral
freq,1,1,11117


In [12]:
df = df.drop_duplicates(subset = ["text"], keep = "first")
df = df.reset_index(drop = True)

In [13]:
df.head()

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative


In [14]:
df.drop("textID", axis = 1, inplace = True)

In [23]:
def preprocessing_text(text):
    
    # text to lower
    lower = text.lower()
    
    # remove punctuations
    words = text.translate(str.maketrans("", "", string.punctuation))
    
    # remove stopwords
    stop_words = set(stopwords.words("english"))
    tokenize = word_tokenize(text)
    filter_words = [i for i in tokenize if i not in stop_words]
    
    # Stemming
    ps = PorterStemmer()
    stem_words = [ps.stem(i) for i in filter_words]
    
    # Lemmatizing
    ls = WordNetLemmatizer()
    ls_words = [ls.lemmatize(i, pos = 'a') for i in stem_words]
    
    return " ".join(ls_words)

In [24]:
df["text"] = df["text"].apply(preprocessing_text)

In [25]:
df.head()

Unnamed: 0,text,sentiment
0,"I ` respond , I go",neutral
1,sooo sad I miss san diego ! ! !,negative
2,boss bulli ...,negative
3,interview ! leav alon,negative
4,"son **** , ` put releas alreadi bought",negative


In [28]:
x_train_hl, x_test_hl, y_train_hl, y_test_hl = train_test_split(df["text"], df["sentiment"], random_state = 42, test_size = 0.2)

In [29]:
%%time
# pipeline creation
# 1. tfidVectorization
# 2. linearSVC model
pipe = Pipeline([
                 ("tfidf", TfidfVectorizer()),
                 ("model", LinearSVC()) 
])

linear_svc_model_hl = pipe.fit(x_train_hl, y_train_hl)

# Fit the pipeline to the data
prediction = linear_svc_model_hl.predict(x_test_hl)
print("Model Linear SVC")
# predict on test dataset

# print accuracy score
print(round(accuracy_score(y_test_hl, prediction) * 100, 2))
#print confusion matrix
print(confusion_matrix(y_test_hl, prediction))
#print classification report
print(classification_report(y_test_hl, prediction))


Model Linear SVC
66.78
[[ 960  497  115]
 [ 392 1493  351]
 [  93  378 1217]]
              precision    recall  f1-score   support

    negative       0.66      0.61      0.64      1572
     neutral       0.63      0.67      0.65      2236
    positive       0.72      0.72      0.72      1688

    accuracy                           0.67      5496
   macro avg       0.67      0.67      0.67      5496
weighted avg       0.67      0.67      0.67      5496

Wall time: 1.67 s


In [30]:
%%time
# pipeline creation
# 1. TfidfTransformer
# 2. LogisticRegression model
# 3. CountVectorization
pipe = Pipeline([
                 ("vect", CountVectorizer()),
                 ("tfidf", TfidfTransformer()),
                 ("model", LogisticRegression())
                 
])

log_reg_model_hl = pipe.fit(x_train_hl, y_train_hl)

# Fit the pipeline to the data
prediction = log_reg_model_hl.predict(x_test_hl)
print("Model LogisticRegression")
# predict on test dataset

# print accuracy score
print(round(accuracy_score(y_test_hl, prediction) * 100, 2))
#print confusion matrix
print(confusion_matrix(y_test_hl, prediction))
#print classification report
print(classification_report(y_test_hl, prediction))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model LogisticRegression
68.58
[[ 913  559  100]
 [ 300 1663  273]
 [  74  421 1193]]
              precision    recall  f1-score   support

    negative       0.71      0.58      0.64      1572
     neutral       0.63      0.74      0.68      2236
    positive       0.76      0.71      0.73      1688

    accuracy                           0.69      5496
   macro avg       0.70      0.68      0.68      5496
weighted avg       0.69      0.69      0.69      5496

Wall time: 4.87 s


In [31]:
%%time
# pipeline creation 
# 1. CountVectorizer
# 2. TfidTransformer
# 3. GradientBoostingClassifier
pipe = Pipeline([
                 ("vect", CountVectorizer()),
                 ("tfidf", TfidfTransformer()),
                 ("model", GradientBoostingClassifier())
                 
])

gb_model_hl = pipe.fit(x_train_hl, y_train_hl)

# Fit the pipeline to the data
prediction = gb_model_hl.predict(x_test_hl)
print("Model BernoulliNB")
# predict on test dataset

# print accuracy score
print(round(accuracy_score(y_test_hl, prediction) * 100, 2))
#print confusion matrix
print(confusion_matrix(y_test_hl, prediction))
#print classification report
print(classification_report(y_test_hl, prediction))

Model BernoulliNB
65.92
[[ 665  807  100]
 [ 157 1836  243]
 [  32  534 1122]]
              precision    recall  f1-score   support

    negative       0.78      0.42      0.55      1572
     neutral       0.58      0.82      0.68      2236
    positive       0.77      0.66      0.71      1688

    accuracy                           0.66      5496
   macro avg       0.71      0.64      0.65      5496
weighted avg       0.69      0.66      0.65      5496

Wall time: 35.5 s


In [32]:
%%time
# pipeline creation 
# 1. CountVectorizer
# 2. TfidTransformer
# 3. MultinomialNB
 
pipe = Pipeline([
                 ("vect", CountVectorizer()),
                 ("tfidf", TfidfTransformer()),
                 ("model", MultinomialNB())
                 
])

nb_model_hl = pipe.fit(x_train_hl, y_train_hl)

# Fit the pipeline to the data
prediction = nb_model_hl.predict(x_test_hl)
print("Model Multinomial")
# predict on test dataset

# print accuracy score
print(round(accuracy_score(y_test_hl, prediction) * 100, 2))
#print confusion matrix
print(confusion_matrix(y_test_hl, prediction))
#print classification report
print(classification_report(y_test_hl, prediction))

Model Multinomial
61.12
[[ 611  884   77]
 [ 167 1824  245]
 [  39  725  924]]
              precision    recall  f1-score   support

    negative       0.75      0.39      0.51      1572
     neutral       0.53      0.82      0.64      2236
    positive       0.74      0.55      0.63      1688

    accuracy                           0.61      5496
   macro avg       0.67      0.58      0.59      5496
weighted avg       0.66      0.61      0.60      5496

Wall time: 939 ms


In [33]:
sent1 = ['GST officers detect Rs 4,000 crore of ITC fraud in April-June']
y_predict = linear_svc_model_hl.predict(sent1)
y_predict

array(['neutral'], dtype=object)

In [34]:
sent2 = ["Finance Ministry releases Rs 9,871 crore to 17 states as grant"]
y_predict = linear_svc_model_hl.predict(sent2)
y_predict

array(['neutral'], dtype=object)

In [35]:
sent2 = ["sad"]
y_predict = linear_svc_model_hl.predict(sent2)
y_predict

array(['negative'], dtype=object)

In [36]:
sent2 = ["good"]
y_predict = linear_svc_model_hl.predict(sent2)
y_predict

array(['positive'], dtype=object)

In [39]:
sent2 = ["best"]
y_predict = linear_svc_model_hl.predict(sent2)
y_predict

array(['positive'], dtype=object)