In [1]:
import re
import warnings

import joblib
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.pipeline import Pipeline

# Must stay after the `from nltk.corpus import stopwords` import above: this
# rebinds the name `stopwords` from the corpus reader to the English word list.
stopwords= nltk.corpus.stopwords.words('english')

In [2]:
# Load the labelled tweets. The first CSV column is a saved row index (it is
# displayed as "Unnamed: 0" when read as data), so use it as the index
# instead of carrying it along as an extra column.
df = pd.read_csv("tweet.csv", index_col=0)
df.head()

Unnamed: 0,tweets,class
0,Be aware dirty step to get money #staylight ...,figurative
1,#sarcasm for #people who don't understand #diy...,figurative
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative
3,@wilw Why do I get the feeling you like games?...,figurative
4,-@TeacherArthurG @rweingarten You probably jus...,figurative


In [3]:
# Basic Text Cleaning

def clean_text(text):
    """Normalize a raw tweet into lowercase, space-separated alphabetic words.

    Steps, in order: lowercase; drop URLs; drop @mentions and the '#' symbol
    (the hashtag word itself is kept); replace every remaining non-letter
    with a space; collapse runs of whitespace and trim both ends.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example None, or the float
        NaN pandas uses for a missing cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text, or "" when nothing alphabetic remains.
    """
    # Missing cells would otherwise fail on .lower() with an AttributeError.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # "http\S+" already covers https links, so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", "", text)
    # Mentions are removed entirely; for hashtags only the '#' marker is removed.
    text = re.sub(r"@\w+|#", "", text)
    # Text is lowercase at this point, so a-z is the full set of letters to keep.
    text = re.sub(r"[^a-z]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

In [4]:
# Tokenization
def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    The result is returned exactly as ``word_tokenize`` produces it.
    """
    tokens = word_tokenize(text)
    return tokens

In [5]:
# Stopword Removal
def remove_stopwords(tokens, stop_words=None):
    """Return the tokens that are not stop words, preserving their order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stopwords``
        collection is used.

    Returns
    -------
    list of str
        The tokens that are not in the stop-word collection.
    """
    # Hash the stop words once so each membership test is a constant-time
    # lookup instead of a scan over the whole list.
    blocked = frozenset(stopwords if stop_words is None else stop_words)
    return [word for word in tokens if word not in blocked]

In [6]:
# Lemmatization
def lemmatize_text(tokens, pos='v', lemmatizer=None):
    """Lemmatize each token and join the results into one space-separated string.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to lemmatize.
    pos : str, optional
        Part-of-speech tag forwarded to ``lemmatizer.lemmatize``. Defaults to
        ``'v'``, the value this function previously hard-coded.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(token, pos=...)``. When omitted, a
        new ``WordNetLemmatizer`` is created for the call.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces ("" for no tokens).
        Note that this is a string, not a list, despite the input being tokens.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(token, pos=pos) for token in tokens)

In [7]:
# Run the preprocessing stages in order:
# clean -> tokenize -> drop stop words -> lemmatize.
cleaned_series = df["tweets"].apply(clean_text)
token_series = cleaned_series.apply(tokenize_text).apply(remove_stopwords)

# Store each stage on the frame so intermediate results can be inspected.
df["cleaned_tweets"] = cleaned_series
df["tokens"] = token_series
df["lemmatized_tokens"] = token_series.apply(lemmatize_text)

In [8]:
# Model input is the lemmatized text column; the target is the class column.
Xfeatures = df['lemmatized_tokens']
ylabels = df['class']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    Xfeatures,
    ylabels,
    test_size=0.2,
    random_state=41,
)

In [9]:
# Naive Bayes pipeline: token counts from CountVectorizer feed MultinomialNB.
# The variable name `pipe_lr` is kept because later cells refer to it.
# (A TfidfVectorizer + BernoulliNB pipeline is an alternative to compare.)
pipe_lr = Pipeline(steps=[('cv', CountVectorizer()), ('mnb', MultinomialNB())])

In [10]:
# Fit on the training split, then score on the held-out split; the score is
# the cell's displayed output.
pipe_lr.fit(x_train, y_train).score(x_test, y_test)

0.6276870163370594

In [11]:
# Persist the fitted pipeline to disk. The context manager closes the file
# even if serialization raises, so the handle is never leaked.
with open('trial_tweet.pkl', "wb") as pipeline_file:
    joblib.dump(pipe_lr, pipeline_file)