Issues to solve
* Far too many duplicates being collected in the dataset
* Handle misspelled and redundant words
* ~~removing mentions and hashtag~~
* ~~removing links, special characters, punctuation marks~~
* ~~stopwords removal~~
* ~~contractions handling~~
* ~~stemming and lemmatization~~ did only lemmatization

In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import fnmatch
import string
from urllib.parse import urlparse
import contractions

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('words')
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer
from nltk.corpus import words, stopwords
from nltk.metrics.distance import jaccard_distance, edit_distance
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nitanshjain/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nitanshjain/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/nitanshjain/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [108]:
# Load the collected tweets export into a DataFrame and preview the first rows.
# NOTE(review): the path is absolute and specific to one machine; the notebook
# cannot be re-run elsewhere without editing it (consider a configurable data dir).
tweets_df = pd.read_csv('/Users/nitanshjain/Documents/Thapar 4th Sem/Machine Learing/Machine_Learning_Project/data/20221026_235416_tweets.csv')
tweets_df.head()

Unnamed: 0,user_id,tweet_id,username,location,following,followers,twt_created_at,total_tweets,retweet_count,text,hashtags,mentions
0,100673500.0,1.5853e+18,sai1951,,396,81,2022-10-26 16:04:20+00:00,8226,0,@DHFWKA @PMOIndia @MoHFW_INDIA @CMofKarnataka ...,"[{'text': 'Covid_19', 'indices': [392, 401]}, ...","[{'screen_name': 'DHFWKA', 'name': ""K'taka Hea..."
1,101430900.0,1.58526e+18,shivaramsingh,Cuttack,777,1242,2022-10-26 13:00:46+00:00,5578,0,@nipun29j @drsuniltaneja @doc_arka @docMPK @sa...,[],"[{'screen_name': 'nipun29j', 'name': 'nipun ve..."
2,1550427000.0,1.58525e+18,rhltiwari711,"Lucknow, India",2573,520,2022-10-26 12:49:43+00:00,4641,0,"Dear @TwitterIndia, kindly provide @verified t...","[{'text': 'VC', 'indices': [74, 77]}, {'text':...","[{'screen_name': 'TwitterIndia', 'name': 'Twit..."
3,190904500.0,1.58523e+18,amishradp,"दिल्ली, भारत",51,9,2022-10-26 11:26:36+00:00,243,0,@care_mediassist @royalsundaram @naveen_shahi1...,[],"[{'screen_name': 'care_mediassist', 'name': 'S..."
4,350945100.0,1.58523e+18,chouhanneeraj07,"Mumbai, India",879,488,2022-10-26 11:23:47+00:00,3450,0,@drchetandeshmu1 @aparanjape @JM_Scindia @mans...,[],"[{'screen_name': 'drchetandeshmu1', 'name': 'd..."


In [109]:
# (rows, columns) of the frame as loaded, before de-duplication or cleaning.
tweets_df.shape

(1293, 12)

In [110]:
# Keep one row per tweet_id (the first occurrence) and report the shape on
# either side of the operation. The frame is modified in place.
# NOTE(review): tweet_id is parsed as float64 (see the dtypes cell). IDs of this
# magnitude (~1.6e18) are above 2**53, so float64 cannot represent them exactly
# and distinct tweets may compare equal here. TODO confirm whether part of the
# large drop in row count is precision loss, e.g. by loading tweet_id as a string.
print(f'Shape of dataset before removal of duplicates is {tweets_df.shape}')
tweets_df.drop_duplicates(subset=['tweet_id'], keep='first', inplace=True)
print(f'Shape of dataset after removal of duplicates is {tweets_df.shape}')

Shape of dataset before removal of duplicates is (1293, 12)
Shape of dataset after removal of duplicates is (139, 12)


In [111]:
# Inspect how pandas parsed each column.
# NOTE(review): check the dtype reported for the ID columns; 64-bit floats
# cannot hold integer IDs above 2**53 exactly.
tweets_df.dtypes

user_id           float64
tweet_id          float64
username           object
location           object
following           int64
followers           int64
twt_created_at     object
total_tweets        int64
retweet_count       int64
text               object
hashtags           object
mentions           object
dtype: object

In [112]:
def _clean_tweet_text(text, tokenizer, lemmatizer, stop_words):
    """Run the full cleaning pipeline on a single tweet and return the result.

    Parameters
    ----------
    text : object
        Raw tweet text. Anything that is not a ``str`` (for example a NaN from
        an empty CSV field) is treated as an empty tweet.
    tokenizer : TweetTokenizer
        Tokenizer reused across tweets.
    lemmatizer : WordNetLemmatizer
        Lemmatizer reused across tweets.
    stop_words : collection of str
        Lower-case words to discard; membership is tested once per token.

    Returns
    -------
    str
        Space-joined, lower-case tokens with links, hashtags, mentions,
        punctuation, non-ASCII characters, digits, stopwords and tokens of one
        or two characters removed, and the remaining tokens lemmatized.
    """
    if not isinstance(text, str):
        return ''

    # removing links: a token that urlparse reports a scheme for is dropped
    text = ' '.join(tok for tok in tokenizer.tokenize(text) if not urlparse(tok).scheme)

    # contractions handling, token by token
    text = ' '.join(contractions.fix(tok) for tok in tokenizer.tokenize(text))

    # adding space between words and punctuations
    text = text.replace(',', ' ,').replace('.', ' .').replace('?', ' ?').replace('!', ' !')

    # removing hashtags and mentions (and tokens led by the bullet character)
    text = ' '.join(tok for tok in tokenizer.tokenize(text) if tok[0] not in ('#', '@', '▪'))

    # removing punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))

    # removing emojis (every run of non-ASCII characters becomes one space)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # lower case
    text = text.lower()

    # remove numbers, then collapse the runs of spaces left behind
    text = re.sub(r'\d+', '', text)
    text = re.sub(' +', ' ', text)

    # removing stopwords
    text = ' '.join(tok for tok in tokenizer.tokenize(text) if tok not in stop_words)

    # lemmatization: each token is lemmatized on its own. Substituting the
    # lemma with str.replace across all tokens would also rewrite any other
    # token that merely contains this one as a substring.
    text = ' '.join(lemmatizer.lemmatize(tok) for tok in word_tokenize(text))

    # removing very short tokens (only tokens longer than two characters stay)
    return ' '.join(tok for tok in tokenizer.tokenize(text) if len(tok) > 2)


def data_preprocessing(df):
    """Clean the ``text`` column of ``df`` and drop tweets that end up too short.

    Every row's text is passed through the cleaning pipeline (links,
    contractions, hashtags/mentions, punctuation, non-ASCII, case, digits,
    stopwords, lemmatization, short tokens). Rows whose cleaned text has five
    or fewer tokens are then removed.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``text`` cleaned and short tweets dropped.

    Notes
    -----
    Uses ``stopwords.words('english')`` and ``word_tokenize``, which rely on
    NLTK data packages being installed (presumably ``stopwords`` and
    ``punkt`` - verify they are downloaded in the setup cell).
    """
    print('Shape of dataset before removal of tweets with less than 5 words is {}'.format(df.shape))

    # Built once: none of these depend on the individual tweet.
    tokenizer = TweetTokenizer()
    lemmatizer = WordNetLemmatizer()
    stop_words = frozenset(stopwords.words('english'))

    # Rows are processed by position instead of being looked up again by their
    # text, so duplicated or missing texts cannot update or drop the wrong row.
    cleaned_texts = [
        _clean_tweet_text(raw_text, tokenizer, lemmatizer, stop_words)
        for raw_text in df['text']
    ]
    df['text'] = cleaned_texts

    # remove small tweets
    # NOTE(review): the printed messages say "less than 5 words", but the
    # cut-off is inclusive - tweets with exactly five tokens are dropped too.
    short_labels = [
        label
        for label, cleaned in zip(df.index, cleaned_texts)
        if len(tokenizer.tokenize(cleaned)) <= 5
    ]
    df.drop(index=short_labels, inplace=True)

    print('Shape of dataset after removal of tweets with less than 5 words is {}'.format(df.shape))

    return df
        
        
        

In [113]:
# Clean the 'text' column and drop short tweets. data_preprocessing modifies the
# frame it receives and returns that same frame, so the raw tweet text is no
# longer available in tweets_df after this cell runs.
tweets_df = data_preprocessing(tweets_df)

Shape of dataset before removal of tweets with less than 5 words is (139, 12)
Shape of dataset after removal of tweets with less than 5 words is (106, 12)


In [116]:
def correcting_words(df):
    """Print, for every tweet, its tokens and the closest vocabulary word for each.

    A token's replacement is the word from ``nltk.corpus.words`` with the
    smallest edit distance among words that share the token's first and last
    character; ties go to the word that appears first in the corpus. A token
    with no such candidate is kept unchanged.

    This is exploratory: the suggestions are only printed. ``df`` is not
    modified and nothing is returned.

    An earlier experiment (not active) replaced out-of-vocabulary words with
    the first vocabulary word whose character-trigram Jaccard distance was
    below 0.5, and wrote the result back into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column; rows whose text is not a ``str`` are
        skipped.
    """
    tokenizer = TweetTokenizer()

    # Read the vocabulary once and bucket it by (first char, last char) so each
    # token is only compared against words that can match it. Corpus order is
    # preserved inside each bucket, which keeps tie-breaking stable.
    candidates_by_ends = {}
    for vocab_word in words.words():
        if vocab_word:
            candidates_by_ends.setdefault((vocab_word[0], vocab_word[-1]), []).append(vocab_word)

    # The same token recurs across tweets; remember its suggestion.
    suggestion_cache = {}

    def closest_word(token):
        if token not in suggestion_cache:
            candidates = candidates_by_ends.get((token[0], token[-1]), [])
            # min() returns the first minimal element; default covers the
            # case where no vocabulary word shares the token's end characters.
            suggestion_cache[token] = min(
                candidates,
                key=lambda candidate: edit_distance(token, candidate),
                default=token,
            )
        return suggestion_cache[token]

    for tweets in df.loc[:, 'text']:
        if not isinstance(tweets, str):
            continue
        list_words = tokenizer.tokenize(tweets)
        correct_words = [closest_word(word) for word in list_words]
        print(list_words)
        print(correct_words)
    