In [1]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk 
from nltk.tokenize import word_tokenize
import emoji
from emoji.unicode_codes import UNICODE_EMOJI

In [2]:
# Load the raw tweet export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook only
# runs where that file exists; consider reading it from a configurable data directory.
data=pd.read_csv('D:\\Manisha\\Undergrad_Semester\\Semesters\\Fall2019\\capstone\\System.tweets_esports_olympics.csv')

In [3]:
# Drop the listed metadata columns; the tweet text is what remains.
metadata_columns = ['_id','created_at','favorite_count','geo','place','id','retweet_count','source']
data = data.drop(columns=metadata_columns)

In [4]:
# Check the first five rows to confirm that only the tweet text column remains.
data.head()

Unnamed: 0,text
0,RT @IntelGaming: The best @RocketLeague player...
1,RT @btvesports: 🏆 L'equip Special Olympics Cat...
2,RT @IntelGaming: The best @StreetFighter playe...
3,RT @IntelGaming: The best @RocketLeague player...
4,https://t.co/wBIbIBMLkc


In [5]:
# Check the dimensions of the data as (number of rows, number of columns).
data.shape

(500, 1)

In [6]:
# De-duplicate on the tweet text, keeping the first occurrence of each distinct tweet.
data = data.drop_duplicates(subset='text', keep='first')

In [7]:
# Check the shape again to see how many rows are left after removing duplicates.
data.shape

(252, 1)

In [8]:
# Drop the old index and create a new consecutive one (0..n-1).
# Removing duplicates leaves gaps in the row labels, and the loops further down
# address rows with range(len(data)), so the labels must be contiguous.
data=data.reset_index(drop=True)

In [9]:
# Preview the first five rows after de-duplication and re-indexing.
data.head()

Unnamed: 0,text
0,RT @IntelGaming: The best @RocketLeague player...
1,RT @btvesports: 🏆 L'equip Special Olympics Cat...
2,RT @IntelGaming: The best @StreetFighter playe...
3,https://t.co/wBIbIBMLkc
4,The team-up of The International Olympic Commi...


In [10]:
# Replace each emoji with its textual alias (the trophy emoji becomes ":trophy:").
data['text'] = data['text'].map(emoji.demojize)

In [11]:
# Preview the rows to confirm that emoji were replaced by their text aliases.
data.head()

Unnamed: 0,text
0,RT @IntelGaming: The best @RocketLeague player...
1,RT @btvesports: :trophy: L'equip Special Olymp...
2,RT @IntelGaming: The best @StreetFighter playe...
3,https://t.co/wBIbIBMLkc
4,The team-up of The International Olympic Commi...


In [12]:
# Data cleaning steps
def clean_tweet(text):
    """Normalise one raw tweet into lowercase, space-separated text.

    The steps run in this order: remove @mentions, remove URLs, lowercase,
    turn tabs/newlines into spaces, blank out a fixed set of punctuation
    characters, delete digits, drop stand-alone single letters, collapse
    runs of whitespace and trim the ends.

    Parameters
    ----------
    text : object
        The raw tweet. It is passed through ``str()`` first, so a
        non-string value (for example a missing value) does not raise.

    Returns
    -------
    str
        The cleaned tweet; an empty string when nothing survives
        (for example a tweet that consisted only of a URL).
    """
    text = str(text)

    # Remove @mentions. Handles may contain underscores, so "_" is part of the
    # pattern; otherwise "@user_name" would leave "name" behind as a word.
    text = re.sub(r'@[A-Za-z0-9_]+', ' ', text)

    # Remove URL links. A single pattern covers http and https, and \S+ takes the
    # whole link including characters such as "-", "_", "?" and "=".
    text = re.sub(r'https?://\S+', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Replace tab / newline / carriage-return characters with a space
    text = re.sub(r"\t|\n|\r", " ", text)

    # Replace the listed punctuation characters with a space
    text = re.sub(r"[,@\#:'?\.$%_!()]", " ", text)

    # Remove digits
    text = re.sub(r"\d", "", text)

    # Remove stand-alone single letters. Lookarounds are used so the neighbouring
    # whitespace is not consumed: adjacent single letters ("a b c") and letters at
    # the very start or end of the tweet are removed as well.
    text = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', ' ', text)

    # Collapse runs of whitespace into one space and trim the ends
    return re.sub(r'\s+', ' ', text).strip()


data['text'] = data['text'].map(clean_tweet)
    

In [13]:
# Preview the tweets after the regex clean-up.
data.head()

Unnamed: 0,text
0,rt the best players will go head to head for t...
1,rt trophy equip special olympics catalunya ha ...
2,rt the best players will go head to head for t...
3,
4,the team-up of the international olympic commi...


In [14]:
# Tokenise: turn every cleaned tweet into a list of word tokens.
data['text'] = data['text'].map(lambda tweet: word_tokenize(str(tweet)))

In [15]:
# Preview the token lists produced by word_tokenize.
data.head()

Unnamed: 0,text
0,"[rt, the, best, players, will, go, head, to, h..."
1,"[rt, trophy, equip, special, olympics, catalun..."
2,"[rt, the, best, players, will, go, head, to, h..."
3,[]
4,"[the, team-up, of, the, international, olympic..."


In [16]:
# Remove English stop words from every token list.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Look tokens up in a set built once instead of scanning the list for every token;
# `stop_words` itself stays a list.
stop_word_set = set(stop_words)
data['text'] = data['text'].map(
    lambda tokens: [word for word in tokens if word not in stop_word_set]
)

In [17]:
# Preview the token lists after stop-word removal.
data.head()

Unnamed: 0,text
0,"[rt, best, players, go, head, head, whole, wor..."
1,"[rt, trophy, equip, special, olympics, catalun..."
2,"[rt, best, players, go, head, head, whole, wor..."
3,[]
4,"[team-up, international, olympic, committee, i..."


In [18]:
# Lemmatization: reduce every word to its base form, treating it in turn as an
# adjective ('a'), a verb ('v') and a noun ('n').
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def _to_base_form(word):
    """Lemmatize `word` as adjective, then verb, then noun, feeding each result into the next pass."""
    for part_of_speech in ('a', 'v', 'n'):
        word = lemmatizer.lemmatize(word, pos=part_of_speech)
    return word


data['text'] = data['text'].map(
    lambda tokens: [_to_base_form(token) for token in tokens]
)

In [19]:
# Preview the token lists after lemmatization.
data.head()

Unnamed: 0,text
0,"[rt, best, player, go, head, head, whole, worl..."
1,"[rt, trophy, equip, special, olympics, catalun..."
2,"[rt, best, player, go, head, head, whole, worl..."
3,[]
4,"[team-up, international, olympic, committee, i..."


In [20]:
# Keep only the purely alphabetic tokens (anything containing other characters,
# such as a hyphen, is discarded).
data['text'] = data['text'].map(lambda tokens: list(filter(str.isalpha, tokens)))

In [21]:
# Preview the token lists after dropping non-alphabetic tokens.
data.head()

Unnamed: 0,text
0,"[rt, best, player, go, head, head, whole, worl..."
1,"[rt, trophy, equip, special, olympics, catalun..."
2,"[rt, best, player, go, head, head, whole, worl..."
3,[]
4,"[international, olympic, committee, intel, wor..."


In [22]:
# Keep only the tokens that appear in NLTK's English word list.
from nltk.corpus import words
# Requires the NLTK 'words' corpus; run nltk.download('words') once if it is missing.

# Build the vocabulary a single time. Calling words.words() inside the comprehension
# re-created the full word list for every token and then scanned it linearly.
# The lookup stays case-sensitive, exactly as before.
english_vocabulary = set(words.words())
data['text'] = data['text'].map(
    lambda tokens: [word for word in tokens if word in english_vocabulary]
)

In [23]:
# Preview the token lists after filtering against the English word list.
data.head()

Unnamed: 0,text
0,"[best, player, go, head, head, whole, world, s..."
1,"[trophy, equip, special, ha, de, la, de]"
2,"[best, player, go, head, head, whole, world, s..."
3,[]
4,"[international, committee, sponsor, announce, ..."


In [24]:
# Remove rows whose token list is empty, then rebuild a consecutive index.
# One boolean mask replaces dropping rows one at a time inside a loop.
has_tokens = data['text'].map(len) > 0
data = data[has_tokens].reset_index(drop=True)


In [25]:
# Preview the final rows after the empty token lists were removed.
data.head()

Unnamed: 0,text
0,"[best, player, go, head, head, whole, world, s..."
1,"[trophy, equip, special, ha, de, la, de]"
2,"[best, player, go, head, head, whole, world, s..."
3,"[international, committee, sponsor, announce, ..."
4,"[place, get, outside, get, fresh, air, game]"


In [None]:
# Save the cleaned tweets to CSV without writing the index column.
# NOTE(review): 'text' holds Python lists at this point, so each cell is presumably written
# as the list's string form — confirm that is what the downstream reader expects.
# NOTE(review): absolute, machine-specific output path; consider a configurable directory.
data.to_csv('D:\\Manisha\\Undergrad_Semester\\Semesters\\Fall2019\\capstone\\System.tweets_esports_olympics_Cleaned_Dataframe.csv', index=False)