In [1]:
import nltk
import torch
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from transformers import BertTokenizer, BertForSequenceClassification
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from scikeras.wrappers import KerasClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle



  from .autonotebook import tqdm as notebook_tqdm


In [10]:
import pandas as pd

# Load the labelled training articles and take a quick look at their shape.
df = pd.read_csv('dataset/train.csv')
print(df.head())
print(len(df))

# Replace every missing value with a single space so that downstream text
# processing always receives a string. (The original cell also evaluated
# `df.isnull().sum()` before this step, but as a bare mid-cell expression its
# result was discarded, so it has been dropped.)
df = df.fillna(' ')

# Confirm that no missing values remain after the fill.
print(df.isnull().sum())

   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  
20800
id        0
title     0
author    0
text      0
label     0
dtype: int64


In [11]:
# Fetch the NLTK resources used by the preprocessing step below:
# the stop-word list, WordNet (for the lemmatizer) and the punkt tokenizer.
for nltk_resource in ("stopwords", "wordnet", "punkt"):
    nltk.download(nltk_resource)

# Lazily populated cache for the objects that are identical for every document.
# Building them once avoids re-reading the stop-word corpus and re-creating the
# lemmatizer for each row the function is applied to.
_PREPROCESSING_CACHE = {}


def _preprocessing_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if 'resources' not in _PREPROCESSING_CACHE:
        # Build both objects before storing so a failure (e.g. corpus not yet
        # downloaded) never leaves a half-initialised cache behind.
        resources = (
            frozenset(nltk.corpus.stopwords.words("english")),
            nltk.stem.WordNetLemmatizer(),
        )
        _PREPROCESSING_CACHE['resources'] = resources
    return _PREPROCESSING_CACHE['resources']


def preprocessing(msg):
    """Normalise one document into a space-separated string of cleaned tokens.

    Steps, in order: tokenise with ``nltk.word_tokenize``, lowercase every
    token, drop English stop words, lemmatise with ``WordNetLemmatizer``, and
    join the surviving tokens with single spaces.

    Parameters
    ----------
    msg : str
        Raw text of a single document.

    Returns
    -------
    str
        The processed tokens joined by ``' '`` (an empty string when no token
        survives).
    """
    stop_words, lemmatizer = _preprocessing_resources()
    # Lowercase before the stop-word test: the stop-word list is lowercase.
    lowered_tokens = (token.lower() for token in nltk.word_tokenize(msg))
    cleaned_tokens = [
        lemmatizer.lemmatize(token)
        for token in lowered_tokens
        if token not in stop_words
    ]
    return ' '.join(cleaned_tokens)

# Run the cleaning function over every article body and keep the result
# alongside the raw text in a new column.
df['processed_text'] = df['text'].apply(preprocessing)

# Show raw vs. processed text side by side for the first few rows.
preview_columns = ['text', 'processed_text']
print(df[preview_columns].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jahnav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jahnav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jahnav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                text  \
0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1  Ever get the feeling your life circles the rou...   
2  Why the Truth Might Get You Fired October 29, ...   
3  Videos 15 Civilians Killed In Single US Airstr...   
4  Print \nAn Iranian woman has been sentenced to...   

                                      processed_text  
0  house dem aide : ’ even see comey ’ letter jas...  
1  ever get feeling life circle roundabout rather...  
2  truth might get fired october 29 , 2016 tensio...  
3  video 15 civilian killed single u airstrike id...  
4  print iranian woman sentenced six year prison ...  


In [12]:
# Features are the cleaned article bodies; the target is the `label` column.
X = df['processed_text']
y = df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary and weights on the training split only, then reuse
# them for the test split so no information from the test rows leaks into the features.
tfidf_vectorizer = TfidfVectorizer()
tf_idf_matrix_train = tfidf_vectorizer.fit_transform(X_train)
tf_idf_matrix_test = tfidf_vectorizer.transform(X_test)

# Logistic Regression
logreg = LogisticRegression()
logreg.fit(tf_idf_matrix_train, y_train)
accuracy_logreg = logreg.score(tf_idf_matrix_test, y_test)
print("Logistic Regression Accuracy:", accuracy_logreg * 100)

# Naive Bayes
NB = MultinomialNB()
NB.fit(tf_idf_matrix_train, y_train)
accuracy_NB = NB.score(tf_idf_matrix_test, y_test)
print("Naive Bayes Accuracy:", accuracy_NB * 100)

# Decision Tree
# random_state pins the tree's internal randomness so the reported accuracy is
# the same on every run instead of drifting between executions.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(tf_idf_matrix_train, y_train)
accuracy_decision_tree = clf.score(tf_idf_matrix_test, y_test)
print("Decision Tree Accuracy:", accuracy_decision_tree * 100)


Logistic Regression Accuracy: 95.0
Naive Bayes Accuracy: 88.07692307692308
Decision Tree Accuracy: 89.15865384615384


In [13]:

# Single seed shared by this cell's train/test split and its stochastic
# estimators, so re-running the cell reproduces the same ensemble and score.
ENSEMBLE_SEED = 43

# Each ensemble member is a self-contained pipeline that vectorises raw
# (already cleaned) text with its own TF-IDF model and then classifies it.
logistic_regression_model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

svm_model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # probability=True enables predict_proba, which soft voting needs; the
    # probability calibration is randomised, hence the explicit seed.
    ('clf', SVC(probability=True, random_state=ENSEMBLE_SEED))
])

rf_model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=ENSEMBLE_SEED))
])

# TODO: BERT (`bert-base-uncased`) and LSTM members were sketched here but never
# enabled. They would need scikit-learn-compatible wrappers exposing
# `predict_proba` before they could join the soft vote.

# Soft voting averages the members' predicted class probabilities.
voting_classifier = VotingClassifier(estimators=[
    ('lr', logistic_regression_model),
    ('svm', svm_model),
    ('rf', rf_model)
], voting='soft')

X = df['processed_text']
y = df['label']

# Split the data into training and testing sets.
# NOTE: this split uses seed 43 while the baseline cell above uses 42, so the
# ensemble is scored on different held-out rows than the individual baselines.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=ENSEMBLE_SEED)

voting_classifier.fit(X_train, y_train)

# Persist the fitted ensemble (including its TF-IDF vocabularies) for later use.
with open('model.pkl', 'wb') as f:
    pickle.dump(voting_classifier, f)


In [14]:
# Score the fitted ensemble on the held-out rows and report accuracy as a percentage.
y_pred = voting_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy * 100)

96.22596153846153
