
# Complete preprocessing Pipeline

## Mount drive and Install other required libraries



In [1]:
# Mount Google Drive at /content/drive (Colab-only helper); the paths used later in
# this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install wordsegment #for hashtag segmentation
!pip install demoji      # for replacing emogis with repective words
!pip install spacy_langdetect # for language detection
!pip install emoji # for preprocessing emogis

## Download deemogi dictionary
import demoji 
demoji.download_codes()

Downloading emoji data ...
... OK (Got response in 0.17 seconds)
Writing emoji data to /root/.demoji/codes.json ...
... OK


In [3]:
#Basic Python and Machine learning libraries
import os, sys, warnings, random, time, re, math, string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from string import punctuation
from collections import Counter
import seaborn as sns
from scipy import stats
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from IPython.display import display
from tqdm import tqdm_notebook

#Pytorch and nltk and other text relevant libraries
import torch
import nltk, emoji
import demoji
from torch import nn, optim
from torch.optim import lr_scheduler
from torch.functional import F 
from torch.utils.data import Dataset, DataLoader


from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

from bs4 import BeautifulSoup
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize.casual import casual_tokenize
from nltk.util import ngrams

import spacy
from spacy_langdetect import LanguageDetector


#Mandatory Lines of code when working with jupyter notebooks
%matplotlib inline
warnings.filterwarnings('ignore')

#tqdm with pandas
from tqdm import tqdm
tqdm.pandas()

from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# sklearn data science models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier, Lasso
from sklearn.svm import LinearSVC
import xgboost as xgb

# add the path of ekphrasis- the hashtag segmentation library
sys.path.insert(1, "/content/drive/MyDrive/inter-iit-bridgei2i/ekphrasis")

In [4]:
# Global plot styling for every figure in the notebook.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names, so use whichever spelling this install provides
# (falling back to the default style if neither exists).
_whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(_whitegrid_style)

# Override individual rcParams after the style sheet so these values win.
plt.rcParams.update({
    'lines.linewidth': 2,
    'font.sans-serif': 'Arial',
    'text.color': 'black',
    'axes.labelcolor': 'black',
    'xtick.color': 'black',
    'ytick.color': 'black',
    'font.size': 12,
})

## Load data

In [5]:
# Read the development tweet workbook from the mounted Drive folder into a DataFrame.
data = pd.read_excel('/content/drive/MyDrive/inter-iit-bridgei2i/Development Data/dev_data_tweet.xlsx')
# Alternative Drive location (same file name), kept for reference:
# data = pd.read_excel('/content/drive/MyDrive/Datasets/IIT tech meet 2021_BRIDGEi2i_NLP/Development Data/dev_data_tweet.xlsx')
# Preview the first rows.
data.head()

Unnamed: 0,Tweet_ID,Tweet,Mobile_Tech_Tag
0,tweet_0001,You'll 💜 my #PitchWars book if you like: 🦋 hat...,0
1,tweet_0002,RT @SkySportsNews: 🚨 Breaking: #WBA have reach...,0
2,tweet_0003,RT @stealyoman_cuso: really says a lot about s...,0
3,tweet_0004,RT @PGtzsche1: HPV vaccines increased serious ...,0
4,tweet_0005,Ramaphosa says if you are positive you must se...,0


In [6]:
# Drop the Tweet_ID column, which is not needed for preprocessing.
# errors="ignore" keeps this cell re-runnable: without it a second execution raises
# KeyError because the column has already been removed from `data`.
data = data.drop(columns="Tweet_ID", errors="ignore")
data.head()

Unnamed: 0,Tweet,Mobile_Tech_Tag
0,You'll 💜 my #PitchWars book if you like: 🦋 hat...,0
1,RT @SkySportsNews: 🚨 Breaking: #WBA have reach...,0
2,RT @stealyoman_cuso: really says a lot about s...,0
3,RT @PGtzsche1: HPV vaccines increased serious ...,0
4,Ramaphosa says if you are positive you must se...,0


In [7]:
data.describe()

Unnamed: 0,Mobile_Tech_Tag
count,4000.0
mean,0.25
std,0.433067
min,0.0
25%,0.0
50%,0.0
75%,0.25
max,1.0


## Replace emojis with their text descriptions



In [8]:
%%time
# New column: the raw tweet with each emoji swapped for its description (sep=" ").
data['cleaned_tweet_text_emoji'] = data['Tweet'].map(
    lambda tweet: demoji.replace_with_desc(tweet, sep=" ")
)

CPU times: user 7.87 s, sys: 6.13 ms, total: 7.88 s
Wall time: 7.9 s


In [9]:
data.head()

Unnamed: 0,Tweet,Mobile_Tech_Tag,cleaned_tweet_text_emoji
0,You'll 💜 my #PitchWars book if you like: 🦋 hat...,0,You'll purple heart my #PitchWars book if yo...
1,RT @SkySportsNews: 🚨 Breaking: #WBA have reach...,0,RT @SkySportsNews: police car light Breaking...
2,RT @stealyoman_cuso: really says a lot about s...,0,RT @stealyoman_cuso: really says a lot about s...
3,RT @PGtzsche1: HPV vaccines increased serious ...,0,RT @PGtzsche1: HPV vaccines increased serious ...
4,Ramaphosa says if you are positive you must se...,0,Ramaphosa says if you are positive you must se...


## Another column with emojis removed

In [10]:
%%time
# New column: the raw tweet with each emoji replaced by a single space.
data['cleaned_tweet_no_emoji'] = data['Tweet'].map(
    lambda tweet: demoji.replace(tweet, " ")
)

CPU times: user 7.23 s, sys: 3.76 ms, total: 7.23 s
Wall time: 7.24 s


In [11]:
data.head()

Unnamed: 0,Tweet,Mobile_Tech_Tag,cleaned_tweet_text_emoji,cleaned_tweet_no_emoji
0,You'll 💜 my #PitchWars book if you like: 🦋 hat...,0,You'll purple heart my #PitchWars book if yo...,You'll my #PitchWars book if you like: hat...
1,RT @SkySportsNews: 🚨 Breaking: #WBA have reach...,0,RT @SkySportsNews: police car light Breaking...,RT @SkySportsNews: Breaking: #WBA have reach...
2,RT @stealyoman_cuso: really says a lot about s...,0,RT @stealyoman_cuso: really says a lot about s...,RT @stealyoman_cuso: really says a lot about s...
3,RT @PGtzsche1: HPV vaccines increased serious ...,0,RT @PGtzsche1: HPV vaccines increased serious ...,RT @PGtzsche1: HPV vaccines increased serious ...
4,Ramaphosa says if you are positive you must se...,0,Ramaphosa says if you are positive you must se...,Ramaphosa says if you are positive you must se...


## Hashtag segmentation - a column containing only the hashtags

In [12]:
%%time
# New column: list of the hashtags in each tweet (the word characters after '#').
data['hashtag_list'] = data['Tweet'].map(re.compile(r"#(\w+)").findall)

CPU times: user 13.4 ms, sys: 1.04 ms, total: 14.5 ms
Wall time: 15.9 ms


In [13]:
data.head()

Unnamed: 0,Tweet,Mobile_Tech_Tag,cleaned_tweet_text_emoji,cleaned_tweet_no_emoji,hashtag_list
0,You'll 💜 my #PitchWars book if you like: 🦋 hat...,0,You'll purple heart my #PitchWars book if yo...,You'll my #PitchWars book if you like: hat...,"[PitchWars, OwnVoices, PWParty20]"
1,RT @SkySportsNews: 🚨 Breaking: #WBA have reach...,0,RT @SkySportsNews: police car light Breaking...,RT @SkySportsNews: Breaking: #WBA have reach...,"[WBA, AFC, DeadlineDay]"
2,RT @stealyoman_cuso: really says a lot about s...,0,RT @stealyoman_cuso: really says a lot about s...,RT @stealyoman_cuso: really says a lot about s...,[]
3,RT @PGtzsche1: HPV vaccines increased serious ...,0,RT @PGtzsche1: HPV vaccines increased serious ...,RT @PGtzsche1: HPV vaccines increased serious ...,[]
4,Ramaphosa says if you are positive you must se...,0,Ramaphosa says if you are positive you must se...,Ramaphosa says if you are positive you must se...,[FamilyMeeting]


## Use ekphrasis for the segmentation of complex words in hashtag

In [14]:
## TODO: fix the segmenter splitting brand names (e.g. "Iphone" => "I Phone")
## ___________________________________________________________

## A JSON file listing all brand and model names already exists at
## /content/drive/MyDrive/inter-iit-bridgei2i/ekphrasis/ekphrasis/dicts/brand_models.json,
## but the problem remains even with it in place.
## To regenerate the file, run /content/drive/MyDrive/inter-iit-bridgei2i/create_json.py
## (the script might have some bugs).
## _____________________________________________________________________________________


# from ekphrasis.classes.segmenter import Segmenter

# def hashtag_seg(tweet):
#   # segmenter using the word statistics from Twitter
#   seg_tw = Segmenter(corpus="twitter",)

#   tweet_lst = tweet.split(' ')
#   new_tweet_lst = []
#   for word in tweet_lst:
#     if word[0] == '#':
#       word = re.sub(r'#','', word)
#       word = seg_tw.segment(word)
    
#     new_tweet_lst.append(word)
  
#   tweet = ' '.join(new_tweet_lst)
#   return tweet

# text = """RT #Iphone #Apple @SkySportsNews: 🚨 Breaking: #WBA have reached a verbal agreement with #AFC in the last hour over a #iPhone 12 loan deal for Ainsley Maitland-Niles. @RobDorsettSky adds he will undergo a medical in London #IPhone in the next couple of hours. #DeadlineDay 🤝"""
# k = hashtag_seg(text)
# print(k)


## Remove URLs, the @ and # symbols, and RT markers; split camel case

In [15]:
def remove_urls(text):
  """Return `text` with every http(s):// link and every www. link deleted."""
  return re.sub(r'https?://\S+|www\.\S+', '', text)

def clean_reserved(text):
  """Blank out Twitter's reserved markers: '@', '#', '_' and the retweet tag "RT".

  Each marker is replaced by a single space, so surrounding words never get glued
  together. "RT" is removed only when it stands alone as a token; a plain substring
  replace would also eat the letters inside words such as "SMART" or "PORT".

  Args:
    text: the tweet text (expected to still be in its original casing).

  Returns:
    The text with the markers replaced by spaces.
  """
  for marker in ('@', '#', '_'):
    text = text.replace(marker, ' ')
  # Runs after '_' has become a space, so "RT_user" still exposes a standalone RT.
  return re.sub(r'\bRT\b', ' ', text)
  
def camel_case_split(identifier):
  """Insert a space at every camel-case boundary of `identifier`.

  A boundary is either a lower-case letter followed by an upper-case one
  ("pitchWars" -> "pitch Wars") or the end of an upper-case run that is followed by
  a capitalised word ("HTTPServer" -> "HTTP Server").

  Args:
    identifier: any string; it may span several lines.

  Returns:
    The segments between boundaries joined with single spaces ("" for empty input).
  """
  boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
  # re.DOTALL is required: without it '.' stops at '\n', a segment that runs into a
  # line break can never reach a boundary, and the text before the break is dropped.
  return " ".join(re.findall(boundary_pattern, identifier, flags=re.DOTALL))
# text = """RT @SkySportsNews:  police car light  Breaking: #WBA have reached a verbal agreement with #AFC in the last hour over a loan deal for Ainsley Maitland-Niles. @RobDorsettSky adds he will undergo a medical in London in the next couple of hours. #DeadlineDay  handshake """
# remove_urls(text)

In [16]:
%%time
def _normalize_tweet(tweet):
  """Strip URLs and reserved Twitter markers, split camel case, then lower-case."""
  return camel_case_split(clean_reserved(remove_urls(tweet))).lower()

# Assign whole columns at once instead of writing cell-by-cell through data[col][i]:
# that form is chained indexing, which pandas does not guarantee writes into `data`
# (the SettingWithCopyWarning is hidden by warnings.filterwarnings('ignore')), and it
# uses the loop counter as an index label, which is only right for a default index.
for tweet_col in ('cleaned_tweet_text_emoji', 'cleaned_tweet_no_emoji'):
  data[tweet_col] = data[tweet_col].map(_normalize_tweet)

CPU times: user 1.06 s, sys: 11 ms, total: 1.07 s
Wall time: 1.08 s


In [17]:
data.head()

Unnamed: 0,Tweet,Mobile_Tech_Tag,cleaned_tweet_text_emoji,cleaned_tweet_no_emoji,hashtag_list
0,You'll 💜 my #PitchWars book if you like: 🦋 hat...,0,you'll purple heart my pitch wars book if y...,you'll my pitch wars book if you like: ha...,"[PitchWars, OwnVoices, PWParty20]"
1,RT @SkySportsNews: 🚨 Breaking: #WBA have reach...,0,sky sports news: police car light breakin...,sky sports news: breaking: wba have reac...,"[WBA, AFC, DeadlineDay]"
2,RT @stealyoman_cuso: really says a lot about s...,0,stealyoman cuso: really says a lot about so...,stealyoman cuso: really says a lot about so...,[]
3,RT @PGtzsche1: HPV vaccines increased serious ...,0,p gtzsche1: hpv vaccines increased serious ...,p gtzsche1: hpv vaccines increased serious ...,[]
4,Ramaphosa says if you are positive you must se...,0,ramaphosa says if you are positive you must se...,ramaphosa says if you are positive you must se...,[FamilyMeeting]


In [18]:
# Inspect the cleaned text of the row labelled 2.
data.loc[2, 'cleaned_tweet_text_emoji']

'   stealyoman cuso: really says a lot about society that we are half way thru a snow storm and not a single man has offered to risk his life to eat my ass by candle light'

## Contraction Mapping and punctuation removal

In [19]:
# Lookup table used by clean_contractions: every key found in the text is replaced by
# its value. Besides English contractions it carries a few extra normalisations at the
# end ('u.s', 'e.g', 'smartphones').
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", 
                       "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", 
                       "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", 
                       "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am",
                       "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", 
                       "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                       "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", 
                       "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                       "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 
                       "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
                       "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is",
                       "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would",
                       "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have",
                       "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
                       "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", 
                       "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
                       "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", 
                       "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", 
                       "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                       "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have",
                       "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'u.s':'america', 'e.g':'for example', 'smartphones':'smartphone'}

# Characters that clean_contractions replaces with a single space.
# NOTE(review): the list also contains accented letters ('Â', 'à', 'â', 'é', 'è', 'Ã',
# 'ï', 'Ø'), so these are stripped out of words as well - presumably to clean up
# mis-encoded text; confirm this is intended.
punct = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

# Character substitutions that clean_contractions applies before it blanks out the
# characters listed in `punct` (typographic quotes/dashes to ASCII, symbols to words).
# Each key appears exactly once; a repeated key in a dict literal is silently
# overwritten by its last occurrence.
# NOTE(review): "£" and "₹" both map to "e", the same value as "€" - confirm intended.
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-",
                 "`": "'", '“': '"', '”': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 
                 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', '!':' '}

# Misspelling and British-to-American spelling corrections (wrong form -> right form).
# NOTE(review): no code in the cells shown here reads this dictionary - confirm that a
# later cell uses it. Some keys are capitalised ('Qoura', 'Whta', 'doI', 'theBest',
# 'Etherium') and would not match text that has already been lower-cased.
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater',
                'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ',
                'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can',
                'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does','Etherium': 'Ethereum', 
                'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 
                'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization',
                'demonetisation': 'demonetization'}

In [20]:
def clean_contractions(text):
    '''Expand contractions, then strip punctuation and special characters.

    Steps, in order:
      1. typographic apostrophes are normalised to the ASCII apostrophe;
      2. every key of `contraction_mapping` that occurs as a whole token is replaced
         by its value;
      3. the substitutions in `punct_mapping` are applied;
      4. every character listed in `punct` becomes a space;
      5. any remaining ASCII punctuation is removed and runs of spaces are collapsed.

    Args:
        text: the tweet text to clean.

    Returns:
        The cleaned text. Leading/trailing spaces are collapsed to one, not stripped.
    '''
    # Normalise apostrophe look-alikes so the ASCII keys of contraction_mapping match.
    for apostrophe in ("’", "‘", "´", "`"):
        text = text.replace(apostrophe, "'")

    if contraction_mapping:
        # One pass with a single alternation, longest key first, so "y'all'd've" is
        # expanded as a unit instead of losing to its prefix "y'all". The lookarounds
        # stop a key from matching inside a longer word (e.g. 'u.s' inside "menu.see").
        # re.compile caches compiled patterns, so repeated calls stay cheap.
        longest_first = sorted(contraction_mapping, key=len, reverse=True)
        contraction_re = re.compile(
            r"(?<!\w)(?:" + "|".join(map(re.escape, longest_first)) + r")(?!\w)"
        )
        text = contraction_re.sub(lambda found: contraction_mapping[found.group(0)], text)

    for symbol, replacement in punct_mapping.items():
        text = text.replace(symbol, replacement)

    for symbol in punct:
        text = text.replace(symbol, ' ')

    # Put a space around any sentence punctuation that is still present, so that
    # "he is a boy.He is good" cannot end up glued together as "he is a boyHe is good"
    # once the punctuation is deleted below.
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    text = re.sub(r'[" "]+', " ", text)
    # Remove whatever ASCII punctuation is left.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Collapse the runs of spaces created by the replacements above.
    text = re.sub(r' +', ' ', text)

    return text
# text = """they're the stealyoman cuso::::::::: really says a lot ((((((((((((((((((((((( about society that we are half way thru```````````````````` a snow storm and not a ????????????????????????????????? &&&&&&&&&&&&&&&&&& single man has offered to risk his life to eat my ass by candle light"""
# clean_contractions(text)

In [21]:
%%time
# Clean both tweet columns with whole-column assignments. Writing cell-by-cell through
# data[col][i] is chained indexing, which pandas does not guarantee updates `data`
# itself, and it uses the loop counter as an index label.
for tweet_col in ('cleaned_tweet_text_emoji', 'cleaned_tweet_no_emoji'):
  data[tweet_col] = data[tweet_col].map(clean_contractions)

CPU times: user 1.8 s, sys: 5.02 ms, total: 1.81 s
Wall time: 1.82 s


In [22]:
data.head()

Unnamed: 0,Tweet,Mobile_Tech_Tag,cleaned_tweet_text_emoji,cleaned_tweet_no_emoji,hashtag_list
0,You'll 💜 my #PitchWars book if you like: 🦋 hat...,0,you will purple heart my pitch wars book if yo...,you will my pitch wars book if you like hate t...,"[PitchWars, OwnVoices, PWParty20]"
1,RT @SkySportsNews: 🚨 Breaking: #WBA have reach...,0,sky sports news police car light breaking wba...,sky sports news breaking wba have reached a v...,"[WBA, AFC, DeadlineDay]"
2,RT @stealyoman_cuso: really says a lot about s...,0,stealyoman cuso really says a lot about socie...,stealyoman cuso really says a lot about socie...,[]
3,RT @PGtzsche1: HPV vaccines increased serious ...,0,p gtzsche1 hpv vaccines increased serious ner...,p gtzsche1 hpv vaccines increased serious ner...,[]
4,Ramaphosa says if you are positive you must se...,0,ramaphosa says if you are positive you must se...,ramaphosa says if you are positive you must se...,[FamilyMeeting]


## Remove duplicates

In [23]:
# Keep the cleaned text with its label, then count the rows that repeat an earlier
# (text, label) pair.
train_test_df = data.loc[:, ['cleaned_tweet_text_emoji', 'Mobile_Tech_Tag']].copy()
duplicate = train_test_df.loc[train_test_df.duplicated(keep='first')]
duplicate.shape[0]

1709

In [24]:
# Keep the last occurrence of every (text, label) pair; the original row labels are
# retained (the index is not reset).
unique_df = train_test_df.loc[~train_test_df.duplicated(keep='last')]
unique_df.shape[0]

2291

# Tokenization, more cleaning, train-test split and baseline

## Tokenization