In [129]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score
from sklearn.feature_extraction.text import TfidfVectorizer

import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import nltk
# nltk.download('stopwords')

In [130]:
# printing the stopwords in English
# print(stopwords.words('english'))

In [131]:
# Load the training data from the CSV that sits next to the notebook.
df = pd.read_csv('./train.csv')
# Per-column number of missing values. isnull().sum() adds up the True flags;
# isnull().count() would only report the row count for every column.
null_counts = df.isnull().sum()
# replacing the null values with empty string
df = df.fillna('')

In [132]:
# Merge the author and title text into a single feature column.
df['content'] = df['author'] + ' ' + df['title']

# Separate the target column (Y) from the remaining columns (X).
Y = df['label']
X = df.drop(columns='label')

In [133]:
# Stemming reduces a word to its root form; this is the shared Porter stemmer instance.
prot_stem = PorterStemmer()

In [134]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, built on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        # stopwords.words() hands back a list; a frozenset makes each
        # membership test constant-time instead of a linear scan.
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def stemming(content):
    """Normalise a text string into space-separated Porter stems.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, English stop words are dropped and
    each remaining token is stemmed.

    Parameters
    ----------
    content : str
        Raw text to normalise.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ('' when no token survives).
    """
    stop_words = _english_stopwords()
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    # Reuse the notebook-level stemmer rather than constructing one per token.
    stems = [prot_stem.stem(token) for token in tokens if token not in stop_words]
    return ' '.join(stems)

In [135]:
# Overwrite the combined text with its stemmed form, then print the column.
df['content'] = df['content'].apply(stemming)
print(df['content'])

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object


In [136]:
# Pull the text and the labels out of the frame as plain value arrays.
x_train = df['content'].values
# The spelling `y_trian` is kept as-is: the train/test split cell refers to this exact name.
y_trian = df['label'].values

In [137]:
# converting the textual data to numerical data
# NOTE(review): the vectorizer is fitted on every row, before the train/test
# split done in a later cell, so the vocabulary and IDF weights are also
# learned from rows that end up in the test set.
vectorizer = TfidfVectorizer()
# fit_transform learns the vocabulary and builds the TF-IDF matrix in a single
# pass instead of processing the corpus once for fit and again for transform.
x_train = vectorizer.fit_transform(x_train)
words = vectorizer.get_feature_names_out()
# NOTE(review): toarray() materialises the whole sparse matrix as a dense
# array purely for inspection; for a large vocabulary this can use a lot of memory.
dense_matrix = x_train.toarray()
print(dense_matrix)
# Feature name stored at column index 15686 of the TF-IDF matrix.
print(words[15686])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
tweet


In [138]:
# Hold out 20% of the rows for evaluation, stratified on the label column.
# x_train is rebound to the training portion here, so re-running this cell on
# its own no longer lines up with y_trian; re-run the vectorising cell first.
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_trian,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [139]:
# Fit a logistic-regression classifier on the training split and predict the held-out rows.
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Show the first ten predictions next to their true labels.
for i in range(10):
    print("y_pred:", y_pred[i], "=> target:", y_test[i])

# Score the predictions against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"accuracy:{accuracy}")
print(f"precision:{precision}")
print(f"recall:{recall}")
print(f"f1:{f1}")

y_pred: 1 => target: 1
y_pred: 0 => target: 0
y_pred: 1 => target: 1
y_pred: 0 => target: 0
y_pred: 0 => target: 0
y_pred: 1 => target: 1
y_pred: 0 => target: 0
y_pred: 1 => target: 0
y_pred: 1 => target: 1
y_pred: 1 => target: 1
accuracy:0.9790865384615385
precision:0.9659197012138189
recall:0.9932789246279404
f1:0.9794082840236686
