In [2]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


In [3]:
# Load the labelled training set and the unlabelled test set from the working directory
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [4]:
# Show the (rows, columns) shape and a preview of the training data.
# Only a cell's last expression is rendered automatically, so the shape is
# printed explicitly; a bare `train.shape` above `train.head()` shows nothing.
print(train.shape)
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [14]:
# Count missing values per column in the training data
train.isna().sum()

id           0
title      558
author    1918
text         0
label        0
dtype: int64

In [13]:
# Drop training rows whose `text` is missing (there is nothing to vectorize).
# Rebinding the name instead of using inplace=True leaves any previously
# displayed frame object untouched and keeps the step chainable.
train = train.dropna(subset=['text'])

In [6]:
# Preview the first five rows of the test set
test.head(n=5)

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [16]:
# Count missing values per column in the test data
test.isna().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [19]:
# Replace missing `text` with an empty string instead of dropping the row.
# Dropping would remove those ids from the final output file, so it would no
# longer hold one prediction per id in test.csv.
# NOTE(review): presumably the submission must cover every test id - confirm.
test = test.fillna({'text': ''})

In [21]:
# Target column the classifier will learn to predict
labels = train['label']
labels.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [25]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train['text'],
    labels,
    test_size=0.2,
    random_state=7,
)


In [28]:
# TF-IDF features: skip English stop words and ignore terms that appear in
# more than 70% of the documents.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Fit the vectorizer on the training texts only, then apply the fitted
# vectorizer unchanged to the held-out texts.
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)


In [29]:
# Train a Passive-Aggressive classifier on the TF-IDF training matrix.
# random_state pins the shuffling of the training data between epochs, so the
# fitted model - and therefore the reported accuracy - is the same on every
# run. The seed matches the one used for the train/test split.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=7)
pac.fit(tfidf_train, y_train)

# Predict on the held-out split and report accuracy as a percentage
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 95.95%


In [32]:
# Confusion matrix of the held-out labels against the model's predictions
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1953,   74],
       [  94, 2032]])

In [35]:
# Vectorize the unlabelled test articles with the already-fitted vectorizer
# and predict their labels. The matrix gets its own name so it does not
# overwrite `tfidf_test`, the held-out validation matrix that the evaluation
# cell pairs with y_test; overwriting it would break re-running that cell.
tfidf_submission = tfidf_vectorizer.transform(test['text'])
final_predictions = pac.predict(tfidf_submission)

In [39]:
# Pair each test id with its predicted label and write the result to CSV
# (index=False keeps the DataFrame index out of the file).
output = pd.DataFrame(
    {
        'id': test['id'],
        'label': final_predictions,
    }
)
output.to_csv('output.csv', index=False)
print("Output file saved.")

Output file saved.
