In [None]:
pip install kaggle

In [None]:
# Create the ~/.kaggle directory that kaggle.json is copied into below;
# -p makes this a no-op when the directory already exists.
!mkdir -p ~/.kaggle

In [None]:
# Set up the Kaggle API credentials.
# kaggle.json must already have been uploaded to the current working directory.
# It is copied into ~/.kaggle and restricted to owner read/write (mode 600).

!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# Download the IMDB movie-ratings sentiment dataset from Kaggle through the Kaggle CLI.
# A leading "!" runs the rest of the line as a shell command.
# Dataset used by an earlier version of this notebook (Twitter Sentiment140), kept for reference:
# !kaggle datasets download -d kazanova/sentiment140
!kaggle datasets download -d yasserh/imdb-movie-ratings-sentiment-analysis

In [None]:
# Extract the downloaded archive into the current working directory.
# The CSV file is stored inside the zip archive.
from zipfile import ZipFile
data = '/content/imdb-movie-ratings-sentiment-analysis.zip'

# The archive is bound to `archive` rather than `zip`: a variable named `zip`
# would shadow the built-in zip() for every later cell run in this kernel.
with ZipFile(data,'r') as archive:
  archive.extractall()
  print('The dataset is extracted.')


Importing and installing the dependencies.

In [None]:
# Import the dependencies.
import numpy as np
import pandas as pd
# pandas provides DataFrames (structured tables).
import re
# re provides regular-expression pattern matching.
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# nltk is the NLP library; PorterStemmer is used to stem words.
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer converts textual data to numerical data.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Logistic regression is the classifier used in this notebook.
from sklearn.metrics import accuracy_score

In [None]:
import nltk
nltk.download('stopwords')
print(stopwords.words('english'))
# Stop words are words that carry little meaning on their own; the list printed
# above shows the English ones.
# They are not required for our processing, so the stemming step further down
# removes them from the text.


Data Processing

In [None]:
# Data processing
# Load the data from the CSV file into a pandas DataFrame.
# The file starts with a UTF-8 byte-order mark; decoding it as ISO-8859-1 turns
# that mark into the characters 'ï»¿' glued to the first column name.
# 'utf-8-sig' strips the mark instead, and encoding_errors='replace' substitutes
# any undecodable byte rather than aborting the load.
t_data = pd.read_csv('/content/movie.csv', encoding ='utf-8-sig', encoding_errors='replace')


In [None]:
# Number of rows and columns in t_data.
# Only the last bare expression of a cell is rendered, so anything inspected
# before the end of the cell has to be printed explicitly.
print(t_data.shape)

# If the byte-order mark leaked into the first header ('ï»¿text'), normalise the
# column name to 'text'. rename() ignores the mapping when that key is absent.
t_data = t_data.rename(columns ={'ï»¿text':'text'})

# First five rows of the dataframe.
print(t_data.head())


In [None]:
# Re-read the file, this time supplying the column names explicitly.
# NOTE(review): the file carries its own header line (the previous cell renames a
# header-derived column). Passing `names` without `header=0` makes pandas treat
# that line as an ordinary data row, so row 0 of this frame holds the header
# text until a later cell drops it.
column_names =['text','id']
t_data = pd.read_csv('/content/movie.csv', names = column_names, encoding ='ISO-8859-1')

In [None]:
# (rows, columns) of the re-read frame.
t_data.shape

In [None]:
# Preview the first five rows.
# A bare expression in the middle of a cell produces no output, so the shape
# (already shown by the previous cell) is not repeated here.
t_data.head()

In [None]:
# Display the frame without its first row.
# NOTE: the result is not assigned, so t_data itself is left unchanged by this cell.
t_data.drop(t_data.index[0])

In [None]:
# Show the last five rows (the counterpart of head()).
t_data.tail()

In [None]:
# Missing-value check: count the null entries in every column.
# Any column with a non-zero count would need its gaps replaced or dropped.
t_data.isna().sum()

In [None]:
# Distribution of the target column: number of rows per value of 'id'.
# The author's note states that the labels are evenly distributed here; if they
# were not, upsampling or downsampling would be needed.
t_data['id'].value_counts()

In [None]:
# Leftover from the Sentiment140 (Twitter) version of this notebook, where the
# target value 4 was converted to 1 so that 0 meant negative and 1 meant positive.
# inplace=True means the change is applied to the original DataFrame.
# t_data.replace({'target':{4:1}}, inplace = True)
# Author's note: I think doing this would bias the model and hence affect accuracy.

In [None]:
# Stemming
# Stemming reduces a word to its root form so that related words map to one token.
# Illustrative example: actor, actress, acting -> act
# Illustrative example: choco, chocolatey, chocolates -> chocolate
# NOTE(review): the examples convey the idea only; the stems PorterStemmer
# actually produces may differ — verify before relying on them.
pstem = PorterStemmer()

In [None]:
# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# comprehension re-created the list and scanned it linearly for every single
# token; a frozenset gives constant-time membership tests with the same result.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def stemming(content):
  """Normalise one text into a space-separated string of stemmed tokens.

  Steps: replace every character that is not an ASCII letter with a space,
  lower-case the text, split it on whitespace, drop English stop words and
  stem the remaining words with the module-level `pstem`.

  Args:
    content: the raw text (a str).

  Returns:
    The stemmed tokens joined by single spaces; an empty string when no
    token survives the filtering.
  """
  # [^a-zA-Z] matches every character that is NOT an ASCII letter.
  letters_only = re.sub('[^a-zA-Z]',' ',content)
  tokens = letters_only.lower().split()
  stems = [pstem.stem(word) for word in tokens if word not in _ENGLISH_STOPWORDS]
  return ' '.join(stems)

In [None]:
# Run stemming() on every entry of the 'text' column and store the results in a
# new 'stemmed_content' column.
t_data['stemmed_content'] = t_data['text'].apply(stemming)

In [None]:
# First five rows, now including the 'stemmed_content' column.
t_data.head()


In [None]:
# Print the stemmed text column.
print(t_data['stemmed_content'])

In [None]:
# Number of rows per value of the 'id' (label) column.
t_data['id'].value_counts()

In [None]:
# Remove the CSV header line that was read in as a data row: it is the row whose
# 'id' cell holds text instead of a numeric class value.
# Selecting by value rather than by position keeps this cell idempotent: running
# it a second time no longer throws away a genuine review. Rows with a missing
# label are excluded by the same test.
t_data = t_data[pd.to_numeric(t_data['id'], errors='coerce').notna()]

In [None]:
# t_data = t_data.drop(label= "label", axis = 0)

In [None]:
# Separate the data from the labels.
# X holds the stemmed text, Y holds the values of the 'id' (label) column.
X = t_data['stemmed_content'].values
Y = t_data['id'].values

In [None]:
# Print the feature array (stemmed texts).
print(X)

In [None]:
# Print the label array.
print(Y)

Splitting data into training and testing data

In [None]:
# Label counts right before splitting.
t_data['id'].value_counts()

In [None]:
# Split the data: 20% is held out for testing, the rest is used for training.
#   X_train / Y_train -> training texts and their labels
#   X_test  / Y_test  -> testing texts and their labels
# stratify = Y keeps the label proportions the same in both parts; without it
# the split could end up unfairly distributed.
# random_state fixes the shuffling, so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
  X,
  Y,
  test_size = 0.2,
  stratify = Y,
  random_state = 2,
)

In [None]:
# Shapes of the full data, the training part and the testing part.
print(X.shape, X_train.shape, X_test.shape)

In [None]:
# Convert the text to numerical values (feature extraction) with TfidfVectorizer.
# fit_transform is used on the training data only; the test data is transformed
# with the vectorizer already fitted on the training data.
# NOTE: X_train and X_test are overwritten with their vectorized form, so this
# cell is meant to be run once after the split cell; to re-run it, re-run the
# split cell first.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
# Print the vectorized training data.
print(X_train)

In [None]:
# Print the vectorized test data.
print(X_test)

# In each printed pair the first element is the text (row) number: 0 means the
# words belong to the 0th text, and so on.
# NOTE(review): the second element is presumably the column index of a word in
# the fitted vocabulary, and the value next to the pair its TF-IDF weight —
# confirm against the scikit-learn documentation.

Training the Logistic Regression Model.
(Logistic regression is a machine-learning model used for binary classification and predictive analytics.)

In [None]:
# Create the classifier; max_iter = 1000 sets the solver's iteration limit.
model = LogisticRegression(max_iter = 1000)

Model Training phase

In [None]:
# Train the model on the vectorized training data and the training labels.
model.fit(X_train, Y_train)

Accuracy Score

In [None]:
# Accuracy score on the training data.
# The model predicts labels for the training data; only the features are passed
# in, not the labels.
X_train_prediction =model.predict(X_train)
# The accuracy is obtained by comparing the predicted labels with the actual labels.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the accuracy measured on the training data.
print("Accuracy Score on the training data: ", training_data_accuracy)


In [None]:
# Accuracy score on the test data: predict labels for the test features and
# compare them with the actual test labels.
Y_test_prediction =model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test,Y_test_prediction)

Because the accuracy on the training data and the accuracy on the test data are close to each other, we can say that the model has performed well.

Overfitting: the training accuracy is much higher than the test accuracy.
Example: training accuracy = 80% and test accuracy = 40-50%.


Underfitting: the model performs poorly on both the training and the test data, which shows that the model is too simple for the data and a more complex model is needed.

In [None]:
# Report the test accuracy as a percentage with three decimal places.
print(f"Accuracy Score on Test Data: {test_data_accuracy *100:.3f} %")

In [None]:
# Save the trained model so that it can be used later without retraining.
# NOTE: only the classifier is written to disk. Predicting on new raw text also
# requires the fitted `vectorizer`, which this cell does not save.
import pickle

filename = "modellr.sav"
# 'wb' opens the file for writing in binary format. The with-block closes the
# file handle (and flushes the data) even if pickle.dump raises.
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)


In [None]:
# Load the saved model so the already-trained classifier can be used directly
# for new predictions.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
# The with-block makes sure the file handle is closed after loading.
with open('/content/modellr.sav', 'rb') as model_file:
  load_model = pickle.load(model_file)

In [None]:
# Take the test sample at positional index 7999 (i.e. the 8000th row of X_test)
# and print its actual label.
# NOTE(review): assumes the test split holds at least 8000 rows — confirm.
X_new = X_test[7999]
print(Y_test[7999])

In [None]:
# Predict the label of the selected sample with the reloaded model.
prediction = load_model.predict(X_new)
print(*prediction)

# Depending on how the CSV was parsed, the labels are strings ('0'/'1') or
# integers (0/1). int() normalises both, so the check does not depend on the
# label dtype.
if int(prediction[0]) == 0:
  print("It is a negative review")
else:
  print("It is a positive review")

In [None]:
# Repeat the check for the test sample at positional index 2: print its actual
# label, then the model's verdict.
X_new = X_test[2]
print(Y_test[2])
prediction = load_model.predict(X_new)

# When the labels are strings ('0'/'1'), comparing the prediction with the
# integer 0 is never true and every sample is reported as positive. int()
# normalises string and integer labels alike.
if int(prediction[0]) == 0:
  print("It is a negative review")
else:
  print("It is a positive review")

In [None]:
#Here our project is complete.
