# Importing Libraries

In [1]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [2]:
# named the columns
columns_name=["Tweet ID" , "entity" , "sentiment" , "Tweet"]

In [3]:
# load data
twitter_data = pd.read_csv("E:\\DATA\\twitter_training.csv", header= None, names=columns_name)

In [4]:
twitter_data

Unnamed: 0,Tweet ID,entity,sentiment,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [5]:
twitter_data.shape

(74682, 4)

In [6]:
twitter_data.isnull().sum()

Tweet ID       0
entity         0
sentiment      0
Tweet        686
dtype: int64

In [7]:
twitter_data.duplicated().sum()

2700

In [8]:
twitter_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Tweet ID   74682 non-null  int64 
 1   entity     74682 non-null  object
 2   sentiment  74682 non-null  object
 3   Tweet      73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


### Preprocessing

In [12]:
twitter_data= twitter_data.dropna()

In [13]:
twitter_data=twitter_data.drop_duplicates()

In [15]:
twitter_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71656 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Tweet ID   71656 non-null  int64 
 1   entity     71656 non-null  object
 2   sentiment  71656 non-null  object
 3   Tweet      71656 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.7+ MB


### Make the target column [0,1] 
Negative and Irrelevant  = 0
Positive and Neutral     = 1

In [16]:
twitter_data["sentiment"].value_counts()

Negative      21698
Positive      19713
Neutral       17708
Irrelevant    12537
Name: sentiment, dtype: int64

In [19]:
twitter_data.replace({"sentiment":{"Negative":0}}, inplace=True)
twitter_data.replace({"sentiment":{"Irrelevant":0}}, inplace=True)
twitter_data.replace({"sentiment":{"Positive":1}}, inplace=True)
twitter_data.replace({"sentiment":{"Neutral":1}}, inplace=True)

In [20]:
twitter_data["sentiment"].value_counts()

1    37421
0    34235
Name: sentiment, dtype: int64

In [22]:
twitter_data.head()

Unnamed: 0,Tweet ID,entity,sentiment,Tweet
0,2401,Borderlands,1,im getting on borderlands and i will murder yo...
1,2401,Borderlands,1,I am coming to the borders and I will kill you...
2,2401,Borderlands,1,im getting on borderlands and i will kill you ...
3,2401,Borderlands,1,im coming on borderlands and i will murder you...
4,2401,Borderlands,1,im getting on borderlands 2 and i will murder ...


# Stemming

In [23]:
port_stem= PorterStemmer()

In [27]:
def stemming(content):
    stem_tweet= re.sub("[^a-zA-Z]", " " , content)
    stem_tweet= stem_tweet.lower()
    stem_tweet= stem_tweet.split()
    stem_tweet= [port_stem.stem(word) for word in stem_tweet if not word in stopwords.words("english")]
    stem_tweet= " ".join(stem_tweet)
    return stem_tweet

In [28]:
twitter_data["stem_tweet"]= twitter_data["Tweet"].apply(stemming)

In [29]:
twitter_data.head()

Unnamed: 0,Tweet ID,entity,sentiment,Tweet,stem_tweet
0,2401,Borderlands,1,im getting on borderlands and i will murder yo...,im get borderland murder
1,2401,Borderlands,1,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,1,im getting on borderlands and i will kill you ...,im get borderland kill
3,2401,Borderlands,1,im coming on borderlands and i will murder you...,im come borderland murder
4,2401,Borderlands,1,im getting on borderlands 2 and i will murder ...,im get borderland murder


# Spliting

In [33]:
X= twitter_data["stem_tweet"].values
y= twitter_data["sentiment"].values

In [36]:
x_train , x_test , y_train , y_test= train_test_split(X , y , test_size=0.2 , random_state=42 , stratify=y)

In [37]:
print(X.shape , x_train.shape , x_test.shape)

(71656,) (57324,) (14332,)


# TF-IDF Vectorizer


In [39]:
tf_idf= TfidfVectorizer()
x_train= tf_idf.fit_transform(x_train)
x_test= tf_idf.transform(x_test)

In [45]:
print(x_train)

  (0, 204)	0.24916240404504775
  (0, 18205)	0.13316406013730092
  (0, 2226)	0.19032995645426393
  (0, 17431)	0.1368270273632818
  (0, 3455)	0.18038037328772383
  (0, 10254)	0.29262976330967333
  (0, 2384)	0.2377044964660522
  (0, 9094)	0.18155947083148374
  (0, 2756)	0.24916240404504775
  (0, 14954)	0.2878671945557727
  (0, 14243)	0.23727238394414812
  (0, 149)	0.15727687639625823
  (0, 20056)	0.14921314823501416
  (0, 16998)	0.2290347469649889
  (0, 2823)	0.21338426352418505
  (0, 12807)	0.1989376043430977
  (0, 6222)	0.2949054463382058
  (0, 14222)	0.3318379187940035
  (0, 6796)	0.09248852220327657
  (0, 15182)	0.1882422388907538
  (0, 13837)	0.17525057137570654
  (2, 16956)	0.42275726345631104
  (2, 3723)	0.3794464844343122
  (2, 14902)	0.34375516321008975
  (2, 12474)	0.4285528870820026
  :	:
  (57321, 7717)	0.24356690317995014
  (57321, 104)	0.4250215104274084
  (57321, 19027)	0.20864650988810116
  (57321, 12652)	0.1851066508853636
  (57321, 11794)	0.20995442602575687
  (57321, 13

# Logistic Regression

In [46]:
logreg= LogisticRegression()

In [48]:
logreg.fit(x_train , y_train)
train_pred= logreg.predict(x_train)
test_pred= logreg.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Accuracy


In [50]:
train_acc= accuracy_score(y_train, train_pred)
test_acc= accuracy_score(y_test, test_pred)
print("The Accuracy in training is = ", train_acc)
print("The Accuracy in testing is = ", test_acc)

The Accuracy in training is =  0.8584885911660037
The Accuracy in testing is =  0.8233324030142338
