In [1]:
!pip install pandas
!pip install numpy



In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix

In [4]:
# Read the dataset from 'news.csv' in the current working directory.
dataframe = pd.read_csv('news.csv')
dataframe.head()  # not the last expression of the cell, so nothing is displayed

## Features come from the 'text' column, targets from the 'label' column.
x, y = dataframe['text'], dataframe['label']

# Print both Series, one after the other.
print(x, y, sep='\n')

0       Rajasthan on Wednesday reported 74 new coronav...
1       The total number of coronavirus cases in Delhi...
2       Condoling the demise of actor Rishi Kapoor, PM...
3       Congress leader Rahul Gandhi condoled Rishi Ka...
4       As a sign of respect for healthcare profession...
                              ...                        
4050    Adhering to a policy of "zero tolerance" for c...
4051    While addressing a rally in South Carolina, US...
4052    After AAP government granted sanction to Delhi...
4053    Kamil Siedcynski, a Polish student at Kolkata'...
4054    Police have lodged 22 FIRs against anti-CAA pr...
Name: text, Length: 4055, dtype: object
0       POSITIVE
1       POSITIVE
2       NEGATIVE
3       POSITIVE
4       NEGATIVE
          ...   
4050    POSITIVE
4051    NEGATIVE
4052    POSITIVE
4053    POSITIVE
4054    POSITIVE
Name: label, Length: 4055, dtype: object


In [5]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [6]:
## Convert the raw text into TF-IDF features.
# max_df can be given in two forms:
#   float, e.g. 0.50 -> ignore terms that appear in more than 50% of the documents
#   int,   e.g. 25   -> ignore terms that appear in more than 25 documents
# The value used here is 0.7, i.e. terms found in more than 70% of the
# training documents are ignored.

tfvect = TfidfVectorizer(max_df=0.7, stop_words='english')

# Fit on the training split only, then apply the same fitted vectorizer to
# the test split.
tfid_x_train = tfvect.fit_transform(x_train)
tfid_x_test = tfvect.transform(x_test)

## Now let's fit the Machine Learning Model

In [7]:
## Train a linear-kernel support vector classifier on the TF-IDF training features.
# NOTE(review): gamma is documented as the coefficient for the 'rbf', 'poly'
# and 'sigmoid' kernels, so it looks unused with kernel='linear' -- confirm
# before removing it.
classifier = svm.SVC(C=2, kernel='linear', gamma='auto')
classifier.fit(tfid_x_train, y_train)

SVC(C=2, gamma='auto', kernel='linear')

In [8]:
## Evaluate the trained classifier on the held-out test features.

y_pred = classifier.predict(tfid_x_test)
score = accuracy_score(y_test, y_pred)
# Report accuracy as a percentage rounded to two decimals.
print(f'Accuracy: {round(score * 100, 2)}%')

# Confusion matrix: rows are the true labels, columns the predicted labels,
# both ordered NEGATIVE then POSITIVE.
cf = confusion_matrix(
    y_test,
    y_pred,
    labels=['NEGATIVE', 'POSITIVE'],
)
print(cf)


Accuracy: 81.26%
[[174  86]
 [ 66 485]]


In [9]:
## Let's create a function to test the model on real-time data.

def fake_news_det(news):
    """Classify a single news text with the trained model.

    The text is vectorized with the already-fitted ``tfvect`` and then passed
    to ``classifier``; both are notebook-level objects that must exist before
    this function is called.

    Parameters
    ----------
    news : str
        Raw text of one news item.

    Returns
    -------
    The predicted label for ``news``, i.e. the single element of the
    prediction array returned by ``classifier.predict``.
    """
    # The vectorizer works on a collection of documents, so wrap the single
    # text in a list.
    input_data = [news]
    vectorized_input_data = tfvect.transform(input_data)
    prediction = classifier.predict(vectorized_input_data)
    # Keep printing the raw prediction so existing notebook usage is unchanged.
    print(prediction)
    # Also return the label so callers can use the result programmatically.
    return prediction[0]

In [10]:
import pickle

# Persist the trained classifier to 'model.pkl'. The context manager closes
# the file handle as soon as the dump finishes, so the pickle is flushed to
# disk deterministically instead of whenever the handle is garbage-collected.
# NOTE(review): predicting on new text also requires the fitted `tfvect`
# (see fake_news_det) -- confirm the vectorizer is persisted somewhere too.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)