In [1]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd

In [15]:
# Load the raw tweet data; the CSV is decoded as Latin-1.
df = pd.read_csv(
    'twitter_training.csv',
    encoding='latin-1',
)

In [16]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,number,context,sentiment,text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [17]:
# Encode the four sentiment labels as integer class ids. The column keeps the
# name 'binary_sentiment' even though the mapping defines four classes.
sentiment_to_id = {'Positive': 0, 'Negative': 1, 'Neutral': 2, 'Irrelevant': 3}
df['binary_sentiment'] = df['sentiment'].map(sentiment_to_id)

In [18]:
# Display one randomly drawn row to spot-check the new label column.
df.sample()

Unnamed: 0,number,context,sentiment,text,binary_sentiment
34282,6688,Fortnite,Positive,War zone has the worst gun consistency I that ...,0


In [19]:
# Keep only the model input (text) and the encoded target column.
model_columns = ['text', 'binary_sentiment']
df = df[model_columns]

In [20]:
# Inspect the last rows of the reduced two-column frame.
df.tail()

Unnamed: 0,text,binary_sentiment
74676,Just realized that the Windows partition of my...,0
74677,Just realized that my Mac window partition is ...,0
74678,Just realized the windows partition of my Mac ...,0
74679,Just realized between the windows partition of...,0
74680,Just like the windows partition of my Mac is l...,0


In [21]:
# Count missing values per column.
df.isna().sum()

text                686
binary_sentiment      0
dtype: int64

In [22]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')



In [23]:
# Count rows that are exact duplicates of an earlier (text, label) row.
# NOTE(review): the duplicates are only counted here; no visible cell removes
# them before the train/test split — confirm that is intended.
df.duplicated().sum()

4229

In [24]:
# Summarize dtypes and non-null counts after dropping rows with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73995 entries, 0 to 74680
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   text              73995 non-null  object
 1   binary_sentiment  73995 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.7+ MB


In [25]:
# Row and column counts of the cleaned frame.
df.shape

(73995, 2)

In [26]:
# Class distribution of the encoded target column.
df['binary_sentiment'].value_counts()

binary_sentiment
1    22358
0    20654
2    18108
3    12875
Name: count, dtype: int64

In [27]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [28]:
# Fetch the NLTK resources that preprocess_text relies on: the English
# stop-word list and the tokenizer models behind word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK >= 3.9 loads the tokenizer from 'punkt_tab' rather than 'punkt';
# fetching both keeps word_tokenize working on older and newer installs.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nabin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nabin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
# Lazily built NLTK resources shared by every preprocess_text call, so the
# stop-word set and the stemmer are created once instead of once per row.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, stemmer) pair, building it on first use."""
    if 'pair' not in _PREPROCESS_RESOURCES:
        # Build both pieces before storing so a failure leaves the cache empty.
        pair = (frozenset(stopwords.words('english')), PorterStemmer())
        _PREPROCESS_RESOURCES['pair'] = pair
    return _PREPROCESS_RESOURCES['pair']


def preprocess_text(text):
    """Normalize one piece of text for vectorization.

    Steps: lowercase the input, tokenize it with ``word_tokenize``, drop
    English stop words, Porter-stem the remaining tokens and join them back
    with single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        Space-separated stemmed tokens (empty string if nothing remains).
    """
    stop_words, stemmer = _get_preprocess_resources()

    # Lowercase first so stop-word matching is case-insensitive.
    tokens = word_tokenize(text.lower())

    # Filter stop words and stem the remaining tokens in a single pass.
    stems = [stemmer.stem(token) for token in tokens if token not in stop_words]

    return ' '.join(stems)

In [30]:
# Build a new frame with the cleaned text instead of assigning into `df` in
# place: `df` was produced by column selection and dropna(), and writing a
# column into such a derived frame can raise pandas' SettingWithCopyWarning.
df = df.assign(text=df['text'].apply(preprocess_text))

In [31]:
# Check the last rows of the text column after preprocessing.
df.tail()

Unnamed: 0,text,binary_sentiment
74676,realiz window partit mac like 6 year behind nv...,0
74677,realiz mac window partit 6 year behind nvidia ...,0
74678,realiz window partit mac 6 year behind nvidia ...,0
74679,realiz window partit mac like 6 year behind nv...,0
74680,like window partit mac like 6 year behind driv...,0


In [32]:
# Fit a LabelEncoder on the target column, then produce the encoded labels.
label_encoder = LabelEncoder()
label_encoder.fit(df['binary_sentiment'])
encoded_labels = label_encoder.transform(df['binary_sentiment'])

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Cap the TF-IDF vocabulary, learn it from the training split only, and reuse
# the fitted vectorizer to transform the test split.
max_tfidf_features = 18925
vectorizer = TfidfVectorizer(max_features=max_tfidf_features)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [36]:
# Print each intermediate artifact next to its label, in pipeline order.
labelled_artifacts = (
    ("Processed Texts:", df['text']),
    ("Encoded Labels:", encoded_labels),
    ("X_train:", X_train_tfidf),
    ("X_test:", X_test_tfidf),
    ("y_train:", y_train),
    ("y_test:", y_test),
)
for label, artifact in labelled_artifacts:
    print(label, artifact)

Processed Texts: 0                                           ` respond , go
1                            sooo sad miss san diego ! ! !
2                                           boss bulli ...
3                                    interview ! leav alon
4                son * * * * , ` put releas alreadi bought
                               ...                        
27476    wish could come see u denver husband lost job ...
27477    ` wonder rake . client made clear .net , ` for...
27478    yay good . enjoy break - probabl need hectic w...
27479                                      worth * * * * .
27480             flirt go - atg smile . yay . ( ( hug ) )
Name: text, Length: 27480, dtype: object
Encoded Labels: [2 1 1 ... 0 0 2]
X_train:   (0, 11546)	0.3264729758838995
  (0, 6761)	0.5061459813569884
  (0, 4783)	0.4962708183154321
  (0, 8521)	0.5341264876095198
  (0, 12229)	0.3250473991530014
  (1, 8575)	0.41541355502741845
  (1, 8041)	0.3547812359313191
  (1, 8638)	0.346691805383014


In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [38]:
# Fit the bag-of-words vectorizer under its own name. Binding it to
# `vectorizer` would replace the fitted TfidfVectorizer that produced
# X_train_tfidf / X_test_tfidf, so the later `vectorizer.transform(...)` call
# and the pickled 'vectorizerenglish.pkl' would no longer match the features
# the classifiers were trained on.
count_vectorizer = CountVectorizer()
X_train_vectorized = count_vectorizer.fit_transform(X_train)

In [38]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix.
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tfidf, y_train)

In [39]:
# Predict class ids for the held-out TF-IDF matrix.
y_pred = naive_bayes_classifier.predict(X_test_tfidf)

In [40]:
# String form of the encoder's classes, passed as target_names to the reports.
class_names = label_encoder.classes_.astype(str)

In [41]:
# Evaluate the Naive Bayes predictions on the held-out split.
class_report = classification_report(
    y_test,
    y_pred,
    target_names=class_names,
)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [42]:
# Recompute the accuracy of y_pred against y_test and display it.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.717143050206095

In [43]:
# Print the report so its line breaks render as a table; leaving the string as
# the cell's last expression shows the raw repr with literal '\n' escapes.
report = classification_report(y_test, y_pred)
print(report)

'              precision    recall  f1-score   support\n\n           0       0.69      0.81      0.74      4123\n           1       0.65      0.88      0.75      4463\n           2       0.82      0.62      0.71      3589\n           3       0.92      0.44      0.59      2624\n\n    accuracy                           0.72     14799\n   macro avg       0.77      0.69      0.70     14799\nweighted avg       0.75      0.72      0.71     14799\n'

In [44]:
# Confusion matrix for y_pred against y_test, displayed as the cell output.
confusion = confusion_matrix(y_test, y_pred)
confusion

array([[3329,  596,  165,   33],
       [ 366, 3907,  158,   32],
       [ 538,  789, 2230,   32],
       [ 594,  709,  174, 1147]], dtype=int64)

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Support-vector classifier on the TF-IDF features.
# Relies on X_train_tfidf, X_test_tfidf, y_train, y_test and class_names
# already being defined.

# Build the classifier with its default settings and fit it in one step
# (fit returns the fitted estimator).
svm_classifier = SVC().fit(X_train_tfidf, y_train)

# Predict the held-out split.
y_pred_svm = svm_classifier.predict(X_test_tfidf)

# Score the predictions: accuracy, confusion matrix, per-class report.
accuracy_svm = accuracy_score(y_test, y_pred_svm)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
class_report_svm = classification_report(
    y_test,
    y_pred_svm,
    target_names=class_names,
)


In [28]:
# Display the accuracy of the SVM predictions.
accuracy_svm

0.6991633321207712

In [45]:
import numpy as np

In [57]:
# Classify an ad-hoc sentence with the Naive Bayes model.
new_text = ["i love you"]

# Apply the same cleaning that was applied to the training text before
# vectorizing, so the tokens line up with the fitted vocabulary.
new_text_processed = [preprocess_text(text) for text in new_text]
new_text_reshaped = vectorizer.transform(new_text_processed)

# The prediction comes from naive_bayes_classifier, so the variable and the
# printed label name that model rather than the SVM.
new_predictions_nb = naive_bayes_classifier.predict(new_text_reshaped)

print("Predictions for new text (naive bayes)", new_predictions_nb)

Predictions for new text svm [0]


In [58]:
import pickle

# Persist both classifiers and the object bound to `vectorizer`. Each file is
# written inside a `with` block so the handle is flushed and closed even if
# pickling fails part-way.
artifacts_to_save = (
    (naive_bayes_classifier, 'sentimentenglish_model.pkl'),
    (svm_classifier, 'sentimentenglish_model_svm.pkl'),
    (vectorizer, 'vectorizerenglish.pkl'),
)
for artifact, target_path in artifacts_to_save:
    with open(target_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)