## Upload kaggle.json file & Importing the dataset

In [18]:
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json

In [19]:
# !kaggle datasets download -d kazanova/sentiment140

In [20]:
# from zipfile import ZipFile
# dataset = '/content/sentiment140.zip'

# with ZipFile(dataset, 'r') as zip:
#     zip.extractall()
#     print("The dataset is extracted")

## Importing the dependencies

In [21]:
import pickle
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer as ps
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [22]:
import nltk
# Download the NLTK resources used by this notebook: the stop-word lists and
# the Punkt tokenizer models. Each call is a no-op when the package is already
# up to date locally.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Data processing

In [23]:
# Column labels, supplied explicitly through `names=` when reading the CSV
column_names = [
    'target',
    'id',
    'date',
    'flag',
    'user',
    'text',
]

# Read the tweets into a DataFrame
twitter_data = pd.read_csv('data.csv', names=column_names)

In [24]:
# Inspect the dataset dimensions as (rows, columns)

twitter_data.shape

(50002, 6)

In [25]:
# Count the missing values in each column of the dataset

twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [26]:
# Check how the rows are distributed across the values of the target column

twitter_data['target'].value_counts()

0    25001
4    25001
Name: target, dtype: int64

In [27]:
# Recode the positive label from 4 to 1 so the target is binary:
# 0 --> Negative
# 1 --> Positive

twitter_data['target'] = twitter_data['target'].replace(4, 1)

In [28]:
# Stemming

stemmer = ps()
# Build the stop-word lookup once. The comprehension used to call
# `stopwords.words('english')` for every single token, producing a fresh list
# (and a linear scan of it) each time; a frozenset built up front gives the
# same membership answers with O(1) lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a text into a space-separated string of stemmed tokens.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case the result, split on whitespace, drop English stop words and
    Porter-stem the remaining tokens.

    Parameters
    ----------
    content : str
        Raw text to clean (in this notebook, the text of a tweet).

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    # Digits, punctuation and symbols all become separators here.
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        stemmer.stem(word) for word in words if word not in _ENGLISH_STOPWORDS
    )

In [29]:
# Run the `stemming` preprocessing on every tweet text and keep the result in a
# new 'stemmed' column next to the original text
twitter_data['stemmed'] = twitter_data['text'].apply(stemming)

In [30]:
# Pull the features (stemmed text) and the labels (target) out of the frame

X, y = twitter_data['stemmed'].values, twitter_data['target'].values

## Splitting the data into training and testing data

In [31]:
# Hold out 20% of the samples for testing; stratify on the labels and fix the
# random seed so the split is reproducible

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=7,
)

In [32]:
# Converting the textual data into numerical data (TF-IDF features)

vectorizer = TfidfVectorizer()

# The vectorizer is fitted on the training texts only; the test texts are then
# mapped with that already-fitted vectorizer (`transform`, not `fit_transform`).
# NOTE(review): X_train / X_test are rebound from raw text to vectorized output
# here, so re-running this cell without re-running the split cell feeds
# already-vectorized data back into the vectorizer.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

## Training the ML Model

In [33]:
# Logistic regression classifier; `max_iter` caps the solver at 3000 iterations
model = LogisticRegression(max_iter = 3000)

In [34]:
# Fit the classifier on the vectorized training features and their labels
model.fit(X_train, y_train)

## Model Evaluation

In [35]:
# Accuracy Score (Training Data)
# Predict on the same samples the model was fitted on and compare the
# predictions with their true labels.

X_train_pred = model.predict(X_train)
trda = accuracy_score(y_train, X_train_pred)

print("Accuracy Score of the Training Data: ", trda)

Accuracy Score of the Training Data:  0.8340541486462838


In [36]:
# Accuracy Score (Testing Data)
# Predict on the held-out samples and compare the predictions with their true
# labels.
X_test_pred = model.predict(X_test)
teda = accuracy_score(y_test, X_test_pred)

print("Accuracy Score of the Testing Data: ", teda)

Accuracy Score of the Testing Data:  0.7616238376162384


## Save The Trained Model

In [37]:
filename = 'NLPTSA.pkl'
# Write through a context manager so the file handle is flushed and closed
# deterministically; the bare `open(...)` argument left the handle unclosed.
# NOTE(review): only the classifier is pickled here; the fitted `vectorizer`
# is not saved, so the pickle alone cannot score raw text in a new session.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

## Loading the Trained Model

In [38]:
# Read through a context manager so the file handle is closed as soon as the
# model has been loaded.
# NOTE: unpickling can execute arbitrary code from the file; only load pickles
# that come from a trusted source.
with open('NLPTSA.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

## Using the Trained Model

In [39]:
# Pick one held-out sample and show its true label
X_new = X_test[2542]
print(y_test[2542])

# Predict its label with the loaded model
prediction = model.predict(X_new)
print(prediction)

# Label 0 is negative; anything else is reported as positive
print("Negative Tweet" if prediction[0] == 0 else "Positive Tweet")

1
[1]
Positive Tweet


## Testing on Custom Input

In [41]:
def process_input(input):
    """Turn a raw text into the feature row the trained model expects.

    The text goes through the same preprocessing as the training data:
    `stemming` followed by the already-fitted `vectorizer`. The previous
    extra `nltk.word_tokenize` round-trip was dropped because the training
    texts never went through it (the tokenizer splits contractions such as
    "cannot", so inference could see tokens the vectorizer was not fitted
    on), and it required Punkt resources that may not be installed.

    Parameters
    ----------
    input : str
        Raw text to classify. The name shadows the built-in `input` inside
        this function only; it is kept so existing callers keep working.

    Returns
    -------
    The result of `vectorizer.transform` for a one-document list.
    """
    cleaned = stemming(input)
    # `transform` expects an iterable of documents, hence the one-item list.
    return vectorizer.transform([cleaned])


# Use a name other than `input`: rebinding it at notebook scope would shadow
# the built-in `input()` for every later cell run in this kernel.
custom_text = "I love to read novels."

# Vectorize the text and classify it with the trained model
prediction = model.predict(process_input(custom_text))
print(prediction)

# Label 0 is negative; anything else is reported as positive
if (prediction[0] == 0):
    print("Negative Tweet")
else:
    print("positive Tweet")

[1]
positive Tweet
