In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords

In [2]:
df_train = pd.read_csv('train_E6oV3lV.csv', header = None)

In [3]:
df_train.head()

Unnamed: 0,0,1,2
0,id,label,tweet
1,1,0,@user when a father is dysfunctional and is s...
2,2,0,@user @user thanks for #lyft credit i can't us...
3,3,0,bihday your majesty
4,4,0,#model i love u take with u all the time in ...


In [4]:
# Renaming the columns for better understanding
df_train.columns = ['ID','Sentiment', 'Text']

In [5]:
df_train.columns

Index(['ID', 'Sentiment', 'Text'], dtype='object')

In [6]:
df_test = pd.read_csv('test_tweets_anuFYb8.csv', header = None)

In [7]:
df_test.head()

Unnamed: 0,0,1
0,id,tweet
1,31963,#studiolife #aislife #requires #passion #dedic...
2,31964,@user #white #supremacists want everyone to s...
3,31965,safe ways to heal your #acne!! #altwaystohe...
4,31966,is the hp and the cursed child book up for res...


In [9]:
df_test.columns = ['ID','Text']

In [10]:
df_test.columns

Index(['ID', 'Text'], dtype='object')

In [11]:
df_test.isnull().sum()

ID      0
Text    0
dtype: int64

In [12]:
df_train.isnull().sum()

ID           0
Sentiment    0
Text         0
dtype: int64

In [13]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91932\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [14]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

def remove_symbols(text):
    pattern = r'[^A-Za-z\s]'
    text = re.sub(pattern, '', text)
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

# Clean text column
df_train['Text'] = df_train['Text'].apply(lambda x: remove_html_tags(x))
df_train['Text'] = df_train['Text'].apply(lambda x: remove_symbols(x))

df_test['Text'] = df_test['Text'].apply(lambda x: remove_html_tags(x))
df_test['Text'] = df_test['Text'].apply(lambda x: remove_symbols(x))



In [19]:
# split data
X_train, X_test, y_train, y_test = train_test_split(df_train["Text"], df_train["Sentiment"], test_size=0.20, random_state=0)

In [20]:
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 0.7)

tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [21]:
pac = PassiveAggressiveClassifier(max_iter = 50)
pac.fit(tfidf_train, y_train)

In [22]:
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)

In [23]:
print(f'Acc: {round(score*100, 2)}%')

Acc: 96.23%


In [27]:
confusion_matrix(y_test, y_pred)

array([[5857,   93],
       [ 148,  295]], dtype=int64)