In [62]:
import numpy as np
import pandas as pd
import re #regular expression
from nltk.corpus import stopwords
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from nltk.stem import WordNetLemmatizer 
from sklearn.metrics import classification_report, f1_score, accuracy_score, recall_score, precision_score, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

In [63]:
#Loading the dataset and naming coluns
columns = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', header=None, names=columns)

# Exploratory data analysis 

In [64]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [66]:
df.tail()

Unnamed: 0,target,ids,date,flag,user,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [67]:
df.shape

(1600000, 6)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   ids     1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [70]:
df.dtypes

target     int64
ids        int64
date      object
flag      object
user      object
text      object
dtype: object

In [9]:
#checking for missing values
df.isnull().sum()

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64

In [71]:
print('Count of columns in the data is:  ', len(df.columns))

Count of columns in the data is:   6


In [72]:
print('Count of rows in the data is:  ', len(df))

Count of rows in the data is:   1600000


In [10]:
#checking distribution of target column

df['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [11]:
#Converting Target values from 4 TO 1
df.replace({'target':{4:1}}, inplace=True)

In [12]:
df.drop(columns=['ids','date','flag','user'],axis=1,inplace=True)
df.columns.values[0] = 'sentiment'
df.columns.values[1] = 'tweet'
df.head()

Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [13]:
#Data preprocessing
df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'http\S+', '', x))  # Remove URLs
df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'@[^\s]+', '', x))  # Remove user mentions
df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))  # Remove non-alphabetic characters

In [14]:
#Convert text to lowercase
df['tweet'] = df['tweet'].str.lower()

In [15]:
df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'\b\d+\b', '', x))  # Remove numbers

In [16]:
df.head()

Unnamed: 0,sentiment,tweet
0,0,awww thats a bummer you shoulda got david ...
1,0,is upset that he cant update his facebook by t...
2,0,i dived many times for the ball managed to sa...
3,0,my whole body feels itchy and like its on fire
4,0,no its not behaving at all im mad why am i he...


In [17]:
from nltk.tokenize import word_tokenize
# Load NLTK stopwords
stop_words = set(stopwords.words('english'))
# Removing stopwords from a text
def remove_stopwords(tweet):
    words = word_tokenize(tweet)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

# Apply the remove_stopwords function to the 'tweet' column
df['tweet'] = df['tweet'].apply(remove_stopwords)

In [18]:
# Load NLTK stopwords
stop_words = set(stopwords.words('english'))

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

# Define a function to tokenize, remove stopwords, and perform stemming on a tweet
def process_tweet(tweet):
    # Remove URLs and user mentions
    tweet = re.sub(r'http\S+|@[^\s]+', '', tweet)
    
    # Tokenize the tweet
    words = word_tokenize(tweet)
    
    # Remove stopwords
    filtered_words = [word for word in words if word.lower() not in stop_words]
    
    # Perform stemming
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    
    return stemmed_words

# Apply the process_tweet function to the 'tweet' column
df['tokenized_and_stemmed_tweet'] = df['tweet'].apply(process_tweet)
df.drop(columns=['tokenized_and_stemmed_tweet'], inplace=True)

In [28]:
df.head()

Unnamed: 0,sentiment,tweet
0,0,awww thats bummer shoulda got david carr third...
1,0,upset cant update facebook texting might cry r...
2,0,dived many times ball managed save rest go bounds
3,0,whole body feels itchy like fire
4,0,behaving im mad cant see


In [29]:
X=df['tweet'].values
y=df['sentiment'].values

In [30]:
#Spitting data to training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
print(X_train)

['ya quotid like palm pre touchstone charger readynow yes sounds good beer ready prelaunch'
 'felt earthquake afternoon seems epicenter' 'ruffles shirts like likey'
 ... 'rather average'
 'pickin waitin hurry upi odeeee missed dem table talk nitelol bout fat'
 'home studying maths wooot im going fail shit']


In [32]:
print(X_test)

['ahhh hope ok' 'cool tweet apps razr'
 'know family drama lamehey next time u hang kim n u guys like sleepover whatever ill call u'
 ... 'thanks link rad' 'grounded weekend really dont care best show ever'
 'uploads gives broken link']


In [33]:
print(X.shape, X_train.shape, X_test.shape)

(1600000,) (1280000,) (320000,)


In [34]:
df.head()

Unnamed: 0,sentiment,tweet
0,0,awww thats bummer shoulda got david carr third...
1,0,upset cant update facebook texting might cry r...
2,0,dived many times ball managed save rest go bounds
3,0,whole body feels itchy like fire
4,0,behaving im mad cant see


In [35]:
#feature_extraction(text data to numeric data)

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [36]:
print(X_train)

  (0, 235166)	0.35964102210924837
  (0, 253823)	0.16955962104852562
  (0, 27951)	0.22135987021419296
  (0, 117831)	0.1154625168526158
  (0, 287399)	0.18320421581083393
  (0, 356029)	0.16304374125473287
  (0, 253860)	0.4225771275893855
  (0, 50822)	0.2634945285803247
  (0, 316745)	0.39026860364710153
  (0, 234752)	0.25766496677323697
  (0, 221428)	0.26284387182204494
  (0, 169772)	0.11947906972104297
  (0, 245178)	0.36988419459793875
  (0, 353645)	0.1804188432956967
  (1, 91190)	0.6350072699704031
  (1, 271213)	0.33386641638985104
  (1, 4211)	0.3459298645152091
  (1, 85959)	0.4752578772522855
  (1, 100006)	0.37384684302247984
  (2, 169958)	0.5289897992671001
  (2, 275571)	0.4728760297189555
  (2, 263867)	0.6719589584189485
  (2, 169772)	0.21220087414785244
  (3, 312979)	0.16186217745295556
  (3, 347528)	0.16364078231529966
  :	:
  (1279995, 336920)	0.22780762878716904
  (1279996, 176440)	0.8462994083219723
  (1279996, 117831)	0.5327075290193294
  (1279997, 19600)	0.7976087260474487
  (1

In [37]:
print(X_test)

  (0, 213706)	0.535851897571431
  (0, 136504)	0.4618358799730387
  (0, 5679)	0.7068029172539015
  (1, 321474)	0.3480985470008525
  (1, 253396)	0.7231318827132631
  (1, 62131)	0.3327049903120177
  (1, 14694)	0.49519195389358545
  (2, 342014)	0.323005494619789
  (2, 310964)	0.1822107854668903
  (2, 281751)	0.3900979709092207
  (2, 204318)	0.2276951158020374
  (2, 169772)	0.16886125117481474
  (2, 161552)	0.18593237751726607
  (2, 160088)	0.3818610683532291
  (2, 142584)	0.20559736546629873
  (2, 126375)	0.3000588010130276
  (2, 123455)	0.2398126310301309
  (2, 97522)	0.265620818931066
  (2, 82342)	0.34743589384491386
  (2, 45391)	0.2616496937144713
  (3, 346683)	0.2554925940690815
  (3, 294998)	0.2928090894762417
  (3, 294753)	0.2757229769585521
  (3, 268991)	0.48892927780921697
  (3, 259329)	0.3949570760233513
  :	:
  (319994, 54058)	0.3505687681868391
  (319994, 51409)	0.20674887423991917
  (319994, 45009)	0.4166586644226829
  (319995, 357954)	0.41990727309440223
  (319995, 321785)	0.5

In [38]:
#Logistic Regression
model = LogisticRegression(max_iter=1000)

In [39]:
model.fit(X_train, y_train)

In [40]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)

In [41]:
print('Accuracy score on training data : ' , training_data_accuracy)

Accuracy score on training data :  0.80246875


In [42]:
#Evaluating the model (Accuracy Score on test data)

X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test, X_test_prediction)

In [43]:
print('Accuracy score on test data : ' , test_data_accuracy)

Accuracy score on test data :  0.7838625


In [52]:
conf_matrix = confusion_matrix(y_test, X_test_prediction)
precision = precision_score(y_test, X_test_prediction, average='binary')  # for binary classification
recall = recall_score(y_test, X_test_prediction, average='binary')
f1 = f1_score(y_test, X_test_prediction, average='binary')
conf_matrix = confusion_matrix(y_test, X_test_prediction)

print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)
print('Confusion Matrix:\n', conf_matrix)

Precision: 0.7753255365324331
Recall: 0.8012846871768033
F1-score: 0.7880914009792087
Confusion Matrix:
 [[122225  37269]
 [ 31895 128611]]


In [54]:
# Classification Report
classification_rep = classification_report(y_test, X_test_prediction)
print('Classification Report:\n', classification_rep)

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.77      0.78    159494
           1       0.78      0.80      0.79    160506

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.78    320000
weighted avg       0.78      0.78      0.78    320000



In [73]:
# Making predictions for a new input
new_input = ["I dont like swimming"]
new_input_transformed = vectorizer.transform(new_input)  
prediction_new = model.predict(new_input_transformed)
print('\nPrediction for the new input:', prediction_new)


Prediction for the new input: [0]


In [45]:
from sklearn.naive_bayes import MultinomialNB

# Training Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Model evaluation on training data
X_train_nb_prediction = nb_model.predict(X_train)
training_data_nb_accuracy = accuracy_score(y_train, X_train_nb_prediction)


In [46]:
print('Naive Bayes - Accuracy score on training data:', training_data_nb_accuracy)


Naive Bayes - Accuracy score on training data: 0.80934375


In [48]:
# Model evaluation on test data
X_test_nb_prediction = nb_model.predict(X_test)
test_data_nb_accuracy = accuracy_score(y_test, X_test_nb_prediction)
print('Naive Bayes - Accuracy score on test data:', test_data_nb_accuracy)

Naive Bayes - Accuracy score on test data: 0.764875


In [50]:
from sklearn.metrics import confusion_matrix
# Additional metrics for evaluation
print('Naive Bayes - Classification Report on test data:\n', classification_report(y_test, X_test_nb_prediction))
print('Naive Bayes - Confusion Matrix on test data:\n', confusion_matrix(y_test, X_test_nb_prediction))

Naive Bayes - Classification Report on test data:
               precision    recall  f1-score   support

           0       0.75      0.79      0.77    159494
           1       0.78      0.74      0.76    160506

    accuracy                           0.76    320000
   macro avg       0.77      0.76      0.76    320000
weighted avg       0.77      0.76      0.76    320000

Naive Bayes - Confusion Matrix on test data:
 [[125451  34043]
 [ 41197 119309]]


In [78]:
nb_test_prediction = nb_model.predict(X_test)  
nb_precision = precision_score(y_test, nb_test_prediction, average='binary')
nb_recall = recall_score(y_test, nb_test_prediction, average='binary')
nb_f1 = f1_score(y_test, nb_test_prediction, average='binary')

print('Naive Bayes - Precision: ', nb_precision)
print('Naive Bayes - Recall: ', nb_recall)
print('Naive Bayes - F1 Score: ', nb_f1)

Naive Bayes - Precision:  0.778007459961396
Naive Bayes - Recall:  0.7433304673968575
Naive Bayes - F1 Score:  0.7602737543729967


In [51]:
# Make predictions for a new input
new_input = ["I really like doing my work!"]
new_input_transformed = vectorizer.transform(new_input) 
nb_prediction_new = nb_model.predict(new_input_transformed)

print('\nPrediction using Naive Bayes for the new input:', nb_prediction_new)


Prediction using Naive Bayes for the new input: [1]


In [87]:
import sklearn
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve
# Plotting confusion matrix for Logistic Regression
plot_confusion_matrix(model, X_test, y_test)
plt.title('Confusion Matrix for Logistic Regression')
plt.show()


ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (C:\Users\saura\AppData\Roaming\Python\Python311\site-packages\sklearn\metrics\__init__.py)

In [81]:
!pip install --upgrade scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/7b/0e/25d6b5678ed3c7e12bc94d047d0e9492e89cc78b7ea0034ac0f1cf2ff304/scikit_learn-1.4.1.post1-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.4.1.post1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Downloading scikit_learn-1.4.1.post1-cp311-cp311-win_amd64.whl (10.6 MB)
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/10.6 MB 660.6 kB/s eta 0:00:17
    --------------------------------------- 0.2/10.6 MB 1.7 MB/s eta 0:00:07
   - -------------------------------------- 0.3/10.6 MB 2.3 MB/s eta 0:00:05
   - -------------------------------------- 0.5/10.6 MB 2.3 MB/s eta 0:00:05
   -- ------------------------------------- 0.6/10.6 MB 2.4 MB/s eta 0:00:05
   -- ------------------------------------- 0.7/10.6 MB 2.5 MB/s eta 0

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\saura\\AppData\\Roaming\\Python\\Python311\\site-packages\\~klearn\\.libs\\msvcp140.dll'
Consider using the `--user` option or check the permissions.



In [83]:
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve
import seaborn as sns

# Assuming 'model' is your trained Logistic Regression model
# Also assuming 'X_test' and 'y_test' are your test data and labels

# Get predictions
y_pred = model.predict(X_test)

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix using seaborn
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title('Confusion Matrix for Logistic Regression')
plt.show()

ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (C:\Users\saura\AppData\Roaming\Python\Python311\site-packages\sklearn\metrics\__init__.py)

In [85]:
import sklearn
print(sklearn.__version__)

1.4.0


In [86]:
pip install --upgrade scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/7b/0e/25d6b5678ed3c7e12bc94d047d0e9492e89cc78b7ea0034ac0f1cf2ff304/scikit_learn-1.4.1.post1-cp311-cp311-win_amd64.whl.metadata
  Using cached scikit_learn-1.4.1.post1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Using cached scikit_learn-1.4.1.post1-cp311-cp311-win_amd64.whl (10.6 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.0
    Uninstalling scikit-learn-1.3.0:
      Successfully uninstalled scikit-learn-1.3.0
Successfully installed scikit-learn-1.4.1.post1
Note: you may need to restart the kernel to use updated packages.
