In [1]:
pip install scikit-learn





[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the sentiment dataset from the working directory, decoding the file as Latin-1.
df = pd.read_csv(
    'sentimentenglish.csv',
    encoding='latin-1',
)

In [4]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [5]:
# Encode the sentiment label as an integer class id.
# NOTE(review): despite the column name, this is a three-class target (0/1/2), not binary.
df['binary_sentiment'] = df['sentiment'].map(
    {
        'positive': 0,
        'negative': 1,
        'neutral': 2,
    }
)

In [6]:
# Spot-check one random row; the seed keeps the displayed row stable across re-runs.
df.sample(random_state=42)

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),binary_sentiment
12515,2231fb3466,Totally broke is right. I have to wait till ...,Totally broke is right. I have to wait till Mo...,neutral,night,70-100,Guyana,786552,196850.0,4,2


In [7]:
# Keep only the model input (text) and the target (binary_sentiment).
df = df.loc[:, ['text', 'binary_sentiment']]

In [8]:
# Look at the last rows of the reduced two-column frame.
df.tail()

Unnamed: 0,text,binary_sentiment
27476,wish we could come see u on Denver husband l...,1
27477,I`ve wondered about rake to. The client has ...,1
27478,Yay good for both of you. Enjoy the break - y...,0
27479,But it was worth it ****.,0
27480,All this flirting going on - The ATG smiles...,2


In [9]:
# Count missing values per column.
df.isna().sum()

text                1
binary_sentiment    0
dtype: int64

In [10]:
# Drop every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')



In [11]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [12]:
# Summarize dtypes and non-null counts after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27480 entries, 0 to 27480
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   text              27480 non-null  object
 1   binary_sentiment  27480 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 644.1+ KB


In [13]:
# (rows, columns) of the cleaned frame.
df.shape

(27480, 2)

In [14]:
# Number of rows per class id, to check the class balance.
df['binary_sentiment'].value_counts()

binary_sentiment
2    11117
0     8582
1     7781
Name: count, dtype: int64

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [16]:
# Fetch the NLTK resources used later by preprocess_text (stop-word list and,
# presumably, the tokenizer model behind word_tokenize — confirm for the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nabin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nabin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _porter_stemmer():
    """Return a single shared PorterStemmer instance."""
    return PorterStemmer()


def preprocess_text(text):
    """Normalize one piece of text for vectorization.

    Steps: lower-case, tokenize with ``word_tokenize``, drop English stop
    words, Porter-stem the remaining tokens, and join them with single spaces.

    Parameters
    ----------
    text : str
        Raw input text. Must be a string (``.lower()`` is called on it).

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by spaces.
    """
    # The stop-word set and the stemmer do not depend on `text`, so they are
    # built once and reused instead of being recreated for every row.
    stop_words = _english_stop_words()
    stem = _porter_stemmer().stem

    # Tokenization happens after lower-casing so stop-word matching is case-insensitive.
    tokens = word_tokenize(text.lower())

    # Filter stop words first, then stem what is left.
    return ' '.join(stem(token) for token in tokens if token not in stop_words)

In [18]:
# Replace the raw text column with its cleaned version (one preprocess_text call per row).
df = df.assign(text=df['text'].apply(preprocess_text))

In [19]:
# Inspect the last rows again to see the effect of preprocessing on the text.
df.tail()

Unnamed: 0,text,binary_sentiment
27476,wish could come see u denver husband lost job ...,1
27477,"` wonder rake . client made clear .net , ` for...",1
27478,yay good . enjoy break - probabl need hectic w...,0
27479,worth * * * * .,0
27480,flirt go - atg smile . yay . ( ( hug ) ),2


In [20]:
# Fit the encoder on the class-id column, then turn that column into the label array.
label_encoder = LabelEncoder().fit(df['binary_sentiment'])
encoded_labels = label_encoder.transform(df['binary_sentiment'])

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Learn the TF-IDF vocabulary and weights from the training split only, then
# reuse the fitted vectorizer for the test split.
# NOTE(review): 18925 is a magic number — presumably the observed vocabulary size; confirm.
vectorizer = TfidfVectorizer(max_features=18925)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [24]:
# Print each intermediate artifact next to its label for a quick sanity check.
for label, value in (
    ("Processed Texts:", df['text']),
    ("Encoded Labels:", encoded_labels),
    ("X_train:", X_train_tfidf),
    ("X_test:", X_test_tfidf),
    ("y_train:", y_train),
    ("y_test:", y_test),
):
    print(label, value)

Processed Texts: 0                                           ` respond , go
1                            sooo sad miss san diego ! ! !
2                                           boss bulli ...
3                                    interview ! leav alon
4                son * * * * , ` put releas alreadi bought
                               ...                        
27476    wish could come see u denver husband lost job ...
27477    ` wonder rake . client made clear .net , ` for...
27478    yay good . enjoy break - probabl need hectic w...
27479                                      worth * * * * .
27480             flirt go - atg smile . yay . ( ( hug ) )
Name: text, Length: 27480, dtype: object
Encoded Labels: [2 1 1 ... 0 0 2]
X_train:   (0, 11546)	0.3264729758838995
  (0, 6761)	0.5061459813569884
  (0, 4783)	0.4962708183154321
  (0, 8521)	0.5341264876095198
  (0, 12229)	0.3250473991530014
  (1, 8575)	0.41541355502741845
  (1, 8041)	0.3547812359313191
  (1, 8638)	0.346691805383014


In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [26]:
# Bag-of-words features, kept under their own name. The classifiers below are
# trained on the TF-IDF matrices, so `vectorizer` must keep pointing at the
# fitted TfidfVectorizer: rebinding it here would make later
# `vectorizer.transform(...)` calls and the pickled vectorizer produce raw
# counts instead of the TF-IDF features the models expect.
count_vectorizer = CountVectorizer()
X_train_vectorized = count_vectorizer.fit_transform(X_train)

In [27]:
# Train a multinomial Naive Bayes model (default settings) on the TF-IDF training features.
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tfidf, y_train)

In [28]:
# Naive Bayes predictions for the held-out TF-IDF features.
y_pred = naive_bayes_classifier.predict(X_test_tfidf)

In [29]:
# String form of the encoder's classes, used as row labels in the classification report.
class_names = label_encoder.classes_.astype(str)

In [30]:
# Evaluate the Naive Bayes predictions: overall accuracy, confusion matrix, per-class report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=class_names,
)

In [31]:
# Recompute and display the Naive Bayes accuracy on the test split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.6140829694323144

In [32]:
# Per-class precision/recall/F1 for Naive Bayes, labelled by numeric class id.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.56      0.63      1688
           1       0.75      0.40      0.52      1572
           2       0.54      0.81      0.64      2236

    accuracy                           0.61      5496
   macro avg       0.67      0.59      0.60      5496
weighted avg       0.66      0.61      0.61      5496



In [33]:
# Confusion matrix for Naive Bayes (rows: true class, columns: predicted class).
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
confusion

array([[ 943,   39,  706],
       [  87,  623,  862],
       [ 253,  174, 1809]], dtype=int64)

In [45]:
# Print the report so its newlines render as a table instead of an escaped string repr.
print(class_report)

'              precision    recall  f1-score   support\n\n           0       0.73      0.56      0.63      1688\n           1       0.75      0.40      0.52      1572\n           2       0.54      0.81      0.64      2236\n\n    accuracy                           0.61      5496\n   macro avg       0.67      0.59      0.60      5496\nweighted avg       0.66      0.61      0.61      5496\n'

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [35]:
# Create the SVM classifier; no arguments are passed, so scikit-learn's SVC defaults apply.
svm_classifier = SVC()

In [36]:
# Train the SVM on the same TF-IDF training features used for Naive Bayes.
svm_classifier.fit(X_train_tfidf, y_train)

In [37]:
# SVM predictions for the held-out TF-IDF features.
y_pred_svm = svm_classifier.predict(X_test_tfidf)

In [38]:


# Evaluate the SVM predictions against the held-out labels.
# Overall fraction of correct predictions.
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)

# Counts of true class (rows) versus predicted class (columns).
conf_matrix_svm = confusion_matrix(y_true=y_test, y_pred=y_pred_svm)

# Per-class precision, recall and F1 as a formatted string.
class_report_svm = classification_report(y_true=y_test, y_pred=y_pred_svm)


In [39]:
# Display the SVM test accuracy.
accuracy_svm

0.6957787481804949

In [40]:
# Display the SVM confusion matrix.
conf_matrix_svm

array([[1191,   52,  445],
       [  87,  846,  639],
       [ 237,  212, 1787]], dtype=int64)

In [41]:
# Print the SVM per-class report so it renders as a table.
print(class_report_svm)

              precision    recall  f1-score   support

           0       0.79      0.71      0.74      1688
           1       0.76      0.54      0.63      1572
           2       0.62      0.80      0.70      2236

    accuracy                           0.70      5496
   macro avg       0.72      0.68      0.69      5496
weighted avg       0.71      0.70      0.69      5496



In [42]:
import numpy as np

In [46]:
# Score two unseen sentences with both trained classifiers.
new_text = ["i am so sad today","Sooo SAD I will miss you here in San Diego!!!"]

# The models were trained on text cleaned by preprocess_text (lower-cased,
# stop words removed, stemmed), so new input must go through the same cleaning
# or its tokens will not line up with the learned vocabulary.
new_text_processed = [preprocess_text(sentence) for sentence in new_text]

# `vectorizer` must be the fitted vectorizer whose features the classifiers were trained on.
new_text_reshaped = vectorizer.transform(new_text_processed)

new_predictions_svm = svm_classifier.predict(new_text_reshaped)
new_predictions = naive_bayes_classifier.predict(new_text_reshaped)
print("Predictions for new text svm", new_predictions_svm)
print(new_predictions)

Predictions for new text svm [2 2]
[1 1]


In [44]:
import pickle

# Persist both classifiers and the vectorizer. Each file is opened in a `with`
# block so the handle is flushed and closed as soon as the dump finishes,
# instead of relying on garbage collection to close it.
# NOTE: only load these pickles from trusted sources — unpickling runs arbitrary code.
for artifact, path in (
    (naive_bayes_classifier, 'sentimentenglish_model.pkl'),
    (svm_classifier, 'sentimentenglish_model_svm.pkl'),
    (vectorizer, 'vectorizerenglish.pkl'),
):
    with open(path, 'wb') as handle:
        pickle.dump(artifact, handle)