In [1]:
#installing kaggle
!pip install kaggle



In [3]:
# Place the Kaggle API token (kaggle.json, expected in the current working
# directory) under ~/.kaggle/, creating the directory first if it is missing.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only, since it holds credentials.
!chmod 600 ~/.kaggle/kaggle.json

Importing dataset

In [4]:
#API to fetch the dataset from kaggle
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 98% 79.0M/80.9M [00:04<00:00, 26.3MB/s]
100% 80.9M/80.9M [00:04<00:00, 20.2MB/s]


In [5]:
# Extract the downloaded archive into the current working directory.
from zipfile import ZipFile
dataset = '/content/sentiment140.zip'

# The handle is named `archive` rather than `zip` so the built-in zip()
# is not shadowed for the rest of the kernel session.
with ZipFile(dataset, 'r') as archive:
  archive.extractall()
  print('the dataset is extracted')

the dataset is extracted


Importing dependencies

In [6]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# printing the stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
# loading the data from csv file to pandas dataframe
# NOTE(review): no `names=` / `header=` argument is passed here, so pandas
# uses the first line of the file as the header row. That is why the shape
# reported for this frame has 1,599,999 rows, one fewer than the 1,600,000
# obtained once the file is reloaded with explicit column names.
twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding = 'ISO-8859-1')

In [10]:
twitter_data.shape

(1599999, 6)

In [11]:
# Reload the CSV with explicit column names so that the first line of the
# file is read as a data row instead of being taken as the header.
column_names = ['target','id','date','flag','user','text']
twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv',names=column_names, encoding = 'ISO-8859-1')

In [12]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [13]:
#counting the number of missing values in the dataset
twitter_data.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [14]:
# is there any duplicate
twitter_data.duplicated().sum()

0

In [15]:
# checking the distribution of the target column (number of tweets per label)
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


Convert the target "4" to "1"

In [16]:
# Map the positive label 4 to 1 in the target column; all other values are untouched.
twitter_data['target'] = twitter_data['target'].replace({4: 1})

In [17]:
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


0---> Negative Tweet
1---> Positive Tweet

**Stemming** \
Stemming is the process of reducing a word to its root form.\
Example: actor, actress, acting = act

In [18]:
port_stem = PorterStemmer()

In [19]:
def stemming(content):
  """Clean a raw tweet and return its stemmed words as one space-separated string.

  Processing steps, in order:
    1. blank out whitespace-delimited tokens that start with http:// or https://
    2. replace every character that is not an ASCII letter with a space
    3. lowercase and split into words
    4. drop English stop words and stem the remaining words with `port_stem`

  content: the raw tweet text (str).
  Returns: the stemmed words joined by single spaces ('' if nothing survives).
  """
  # Build the stop-word lookup once and cache it on the function object.
  # Calling stopwords.words('english') inside the comprehension re-created the
  # list and scanned it linearly for every word of every tweet; a frozenset
  # gives the same membership answers in constant time.
  stop_words = getattr(stemming, '_stop_words', None)
  if stop_words is None:
    stop_words = stemming._stop_words = frozenset(stopwords.words('english'))

  # Tokens contain no whitespace, so this pattern blanks a token entirely when
  # it begins with a URL scheme and leaves every other token unchanged.
  content = " ".join(re.sub(r'^https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE) for x in content.split())
  stemmed_content = re.sub('[^a-zA-Z]',' ',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if word not in stop_words]
  stemmed_content = ' '.join(stemmed_content)

  return stemmed_content

In [20]:
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [21]:
twitter_data.tail()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
1599995,1,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...,woke school best feel ever
1599996,1,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...,thewdb com cool hear old walt interview
1599997,1,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...,readi mojo makeov ask detail
1599998,1,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...,happi th birthday boo alll time tupac amaru sh...
1599999,1,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...,happi charitytuesday thenspcc sparkschar speak...


In [22]:
print(twitter_data['stemmed_content'])

0          switchfoot awww bummer shoulda got david carr ...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996              thewdb com cool hear old walt interview
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1600000, dtype: object


In [23]:
# separating the features (stemmed tweet text) from the labels (target column)
X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values

In [24]:
print(X)

['switchfoot awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


Splitting the data into training data and test data

In [25]:
# Hold out 20% of the samples for testing. stratify=Y keeps the label
# proportions the same in both parts; random_state=2 makes the split repeatable.
X_train, X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y ,random_state=2)

In [26]:
print(X.shape,X_train.shape,X_test.shape)

(1600000,) (1280000,) (320000,)


In [27]:
#vectorization
# Turn the stemmed text into TF-IDF features. The vectorizer is fitted on the
# training split only; the test split is transformed with that fitted vectorizer.
# NOTE(review): X_train and X_test are overwritten with the transformed
# matrices, so re-running this cell without first re-running the split cell
# passes matrices, not text, to the vectorizer.

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

**Training the Machine Learning Model**

Logistic Regression

In [30]:
model = LogisticRegression(max_iter=1000)

In [31]:
model.fit(X_train,Y_train)

**Model Evaluation**

Accuracy score

In [32]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train,X_train_prediction)

In [33]:
print('Accuracy score on the training data: ',training_data_accuracy)

Accuracy score on the training data:  0.8004296875


In [34]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test,X_test_prediction)

In [35]:
print('Accuray score on the test data: ',test_data_accuracy)

Accuray score on the test data:  0.7770625


Model accuracy = 77.7%

Saving the trained model

In [5]:
import pickle

# Persist the trained classifier to disk.
# NOTE(review): the loading cell reads '/content/trained_model.sav', which is
# not the path written here; confirm which location is intended. The fitted
# `vectorizer` is not saved, yet it is required to transform new text before
# this model can score it.
filename = '../static/model/trained_model.sav'

# The context manager flushes and closes the file; the bare open() call used
# previously left the handle open.
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)

NameError: name 'model' is not defined

Using the saved model for future prediction

In [37]:
#loading the saved model
# NOTE: unpickling can execute arbitrary code, so only load model files that
# were produced by this notebook or come from a trusted source.
# The context manager closes the file handle once the model is read.
with open('/content/trained_model.sav','rb') as model_file:
  loaded_model = pickle.load(model_file)

In [41]:
# Check the reloaded model against one held-out sample: print the true label,
# then the predicted label and its meaning.
X_new = X_test[200]
print(Y_test[200])

# Predict with `loaded_model` (the unpickled copy) instead of the in-memory
# `model`, so that this cell actually exercises the saved file.
prediction = loaded_model.predict(X_new)
print(prediction)

if(prediction[0] == 0):
  print('Negative Tweet')
else:
  print('Posistive Tweet')


1
[1]
Posistive Tweet


In [1]:
# Check the reloaded model against another held-out sample: print the true
# label, then the predicted label and its meaning.
X_new = X_test[3]
print(Y_test[3])

# Predict with `loaded_model` (the unpickled copy) instead of the in-memory
# `model`, so that this cell actually exercises the saved file.
prediction = loaded_model.predict(X_new)
print(prediction)

if(prediction[0] == 0):
  print('Negative Tweet')
else:
  print('Posistive Tweet')


NameError: name 'X_test' is not defined

In [2]:
# Prediction pipeline function
def predict_sentiment(text):
    """Classify a piece of raw text as "Positive" or "Negative".

    The text is cleaned with `stemming`, converted to features with the
    notebook-level `vectorizer`, and scored by the notebook-level `model`.

    text: raw input string.
    Returns: "Positive" when the predicted label equals 1, otherwise "Negative".
    """
    # transform() expects an iterable of documents, hence the one-element list.
    features = vectorizer.transform([stemming(text)])
    label = model.predict(features)[0]
    return "Positive" if label == 1 else "Negative"

In [3]:
text = "a little bit bad product"
prediction = predict_sentiment(text)
print(f"The sentiment is: {prediction}")

NameError: name 'stemming' is not defined