In [15]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [45]:
df=pd.read_csv("../data/raw data/twitt30k.csv")
df.head(20)

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0
5,Really upset http://myloc.me/5x1T,0
6,@lilyroseallen big pool or paddling pool?! mig...,1
7,@arianna_skye Hee! I did tweet... And you'...,1
8,is happy to have Tickets for the Concerts,1
9,"@mileycyrus http://twitpic.com/78urd - Miley, ...",1


## <a name="p3">Preprocess Text</a>
**Text Preprocessing** is traditionally an important step for **Natural Language Processing (NLP)** tasks. It transforms text into a more digestible form so that machine learning algorithms can perform better.

**The Preprocessing steps taken are:**
1. **Lower Casing:** Each text is converted to lowercase.
2. **Replacing URLs:** Links starting with **"http" or "https" or "www"** are replaced by **"URL"**.
3. **Replacing Emojis:** Replace emojis by using a pre-defined dictionary containing emojis along with their meaning. *(eg: ":)" to "EMOJIsmile")*
4. **Replacing Usernames:** Replace @Usernames with word **"USER"**. *(eg: "@Nilay" to "USER")*
5. **Removing Non-Alphabets:** Replacing characters except Digits and Alphabets with a space.
6. **Removing Consecutive letters:** 3 or more consecutive letters are replaced by 2 letters. *(eg: "Heyyyy" to "Heyy")*
7. **Removing Short Words:** Words with length less than 2 are removed.
8. **Removing Stopwords:** Stopwords are the English words which does not add much meaning to a sentence. They can safely be ignored without sacrificing the meaning of the sentence. *(eg: "the", "he", "have")*
9. **Lemmatizing:** Lemmatization is the process of converting a word to its base form. *(e.g: “Great” to “Good”)*

In [4]:
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}


In [6]:
sw=stopwords.words('english')
print(sw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [7]:
len(sw)

179

In [8]:
#RegEx
import re

In [9]:
s1="See you all IN 2023"
re.findall("[a-z]",s1)

['e', 'e', 'y', 'o', 'u', 'a', 'l', 'l']

In [10]:
re.findall("[A-Z]",s1)

['S', 'I', 'N']

In [11]:
re.findall("[0-9]",s1)

['2', '0', '2', '3']

In [12]:
re.sub("[0-9]","*",s1)

'See you all IN ****'

In [21]:
tweets_list=list(df["twitts"])
sentiment_list=list(df["sentiment"])

In [22]:
len(tweets_list)

30000

In [28]:
def preprocess(tweets):
    wnl=WordNetLemmatizer()

    #Define RegEx
    urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern       = '@[^\s]+'
    alphaPattern      = "[^a-zA-Z0-9]"
    #patterns for 3 or more consecutive letters by 2 letter.
    sequencePattern   = r"(.)\1\1+"
    seqReplacePattern = r"\1"
    #contraction to expansion
    #removal of html tags
    processed_tweet=[]
    for tweet in tweets:
        temp_tweet=[]
        for word in tweet.split():
            if word not in sw:
                word=wnl.lemmatize(word)
                temp_tweet.append(word)
        
        tweet=" ".join(temp_tweet)


        tweet=tweet.lower()
        tweet=re.sub(urlPattern," URL",tweet)
        tweet=re.sub(userPattern," USER",tweet)
        tweet=re.sub(alphaPattern," ",tweet)
        tweet=re.sub(sequencePattern,seqReplacePattern,tweet)


        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, emojis[emoji])
        
        processed_tweet.append(tweet)
    return processed_tweet       
        

        
    

In [None]:
preprocessed_text=preprocess(tweets_list)

In [None]:
preprocessed_text[0:10]

In [31]:
df["processed_tweets"]=preprocessed_text
df.head()

Unnamed: 0,twitts,sentiment,processed_tweets
0,@robbiebronniman Sounds like a great night.,1,USER sounds like great night
1,Damn the person who stolde my wallet !!!!! Ma...,1,damn person stolde wallet may karma come back ...
2,Greetings from the piano bench (photo) http:/...,1,greetings piano bench photo URL
3,@drewryanscott i love it!! i love you!! haha f...,1,USER love it love you haha forget hug you giv...
4,"@kissthestars Pretty pretty pretty please, pak...",0,USER pretty pretty pretty please pakidownloa...


In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df["processed_tweets"],df["sentiment"], 
                                                    test_size = 0.20, random_state = 0)
print(f'Data Split done.')

Data Split done.


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
tfidf.fit(X_train)
X_train=tfidf.transform(X_train)
X_test=tfidf.transform(X_test)

In [36]:
X_train

<24000x21357 sparse matrix of type '<class 'numpy.float64'>'
	with 190672 stored elements in Compressed Sparse Row format>

In [37]:
X_train.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [38]:
from sklearn.metrics import accuracy_score
def build_model(models):
    trained_models=[]
    for model in models:
        model.fit(X_train,y_train)
        # Predict values for Test dataset
        y_pred = model.predict(X_test)

        # Print the evaluation metrics for the dataset.
        print(model, accuracy_score(y_test,y_pred))

        trained_models.append(model)
    return trained_models

    

In [48]:

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
models=[RandomForestClassifier(n_estimators=10)]

In [49]:
trained_models=build_model(models)

RandomForestClassifier(n_estimators=10) 0.711


In [44]:
import pickle

file = open('../models/tfidf.pkl','wb')
pickle.dump(tfidf, file)
file.close()

file = open('../models/lr_model.pkl','wb')
pickle.dump(trained_models[1], file)
file.close()

In [50]:
file = open('../models/rf_model.pkl','wb')
pickle.dump(trained_models[0], file)
file.close()

In [51]:
X_test

<6000x21357 sparse matrix of type '<class 'numpy.float64'>'
	with 45122 stored elements in Compressed Sparse Row format>

<100x21357 sparse matrix of type '<class 'numpy.float64'>'
	with 741 stored elements in Compressed Sparse Row format>