## ADM:  P52/38304/2020   NAME:ONYANGO MARGARET NYAKENO 


In [1]:
#Import all required libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import nltk 
import string
import re
%matplotlib inline

In [2]:
# Load the raw tweets with pandas; column names are supplied explicitly.
# NOTE(review): the previewed 'date' values look like ';2015-08-03 15:35;0;0;',
# i.e. they appear to carry extra ';'-separated fields -- confirm the file's
# real delimiter/layout before relying on this column as a date.
tweet_columns = ["date", "tweet"]
tweets = pd.read_csv(
    'Tweets/TweetRaw.csv',
    sep=',',
    names=tweet_columns,
)

In [3]:
# Preview the first rows to check what landed in the 'date' and 'tweet' columns.
tweets.head()

Unnamed: 0,date,tweet
0,;2015-08-03 15:35;0;0;,@ComedyCentralKE These words can kill a Luhya ...
1,;2014-07-11 23:29;0;0;,The little luhya that remains in me always ...
2,;2014-02-07 18:36;0;2;,@cheernatwildcat kill it at battle this weeken...
3,;2011-10-09 19:34;0;0;,@HomeboyzRadio H.B.R luv dat luhya hit luhyas ...
4,;2015-08-21 09:27;2;3;,#HangOutFriday hahaha ball ya terby( derby) L...


#once we have the data, we can begin to clean it
#we will work with functions to help us clean the data


## Replacing non-ASCII characters with spaces


In [4]:
#function 

def remove_nonascii(text):
    """Return ``text`` with every non-ASCII character swapped for one space.

    A character is kept when its code point is below 128; any other character
    is replaced by a single ' ', so the result has the same length as the input.
    """
    cleaned_chars = []
    for char in text:
        if ord(char) < 128:
            cleaned_chars.append(char)
        else:
            cleaned_chars.append(' ')
    return ''.join(cleaned_chars)

In [5]:
# Run the non-ASCII replacement over every tweet; the result is a Series of strings.
non_ascii_tweets = tweets['tweet'].map(remove_nonascii)

In [6]:
# Preview the first rows of the ASCII-only tweet text.
non_ascii_tweets.head()

0    @ComedyCentralKE These words can kill a Luhya ...
1    The little luhya that remains in me always    ...
2    @cheernatwildcat kill it at battle this weeken...
3    @HomeboyzRadio H.B.R luv dat luhya hit luhyas ...
4    #HangOutFriday hahaha ball ya terby( derby)  L...
Name: tweet, dtype: object

In [7]:
# Put the cleaned text back into a DataFrame next to the original 'date' column.
# Note: this rebinds non_ascii_tweets from a Series to a DataFrame.
non_ascii_tweets = pd.DataFrame(non_ascii_tweets, columns=['tweet']).assign(
    date=tweets['date']
)

non_ascii_tweets.head()


Unnamed: 0,tweet,date
0,@ComedyCentralKE These words can kill a Luhya ...,;2015-08-03 15:35;0;0;
1,The little luhya that remains in me always ...,;2014-07-11 23:29;0;0;
2,@cheernatwildcat kill it at battle this weeken...,;2014-02-07 18:36;0;2;
3,@HomeboyzRadio H.B.R luv dat luhya hit luhyas ...,;2011-10-09 19:34;0;0;
4,#HangOutFriday hahaha ball ya terby( derby) L...,;2015-08-21 09:27;2;3;


In [8]:
# Show one tweet before and after non-ASCII replacement.
example_row = 4247
for frame in (tweets, non_ascii_tweets):
    print(frame['tweet'].iloc[example_row])


(뭣보다 귀찮고 ��� 힘든 체�스 �버트  14세. ADHD.)
(                        14 . ADHD.)


## Remove emoticons

In [9]:
#function to remove emoticons


def remove_emojis(text):
    """Delete emoji characters from ``text``.

    Every run of characters that falls in the ranges listed below is removed
    outright (replaced by the empty string, not by a space). Characters outside
    those ranges are left untouched.
    """
    emoji_char_class = (
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+"
    )
    return re.sub(emoji_char_class, '', text, flags=re.UNICODE)


In [10]:
# NOTE(review): non_ascii_tweets has already had every character with a code
# point >= 128 replaced by a space (see remove_nonascii), so the emoji ranges
# cannot match anything in this input and the text comes out unchanged.
no_emoticons = non_ascii_tweets['tweet'].map(remove_emojis)

In [11]:
# Preview the first rows after the emoji-removal step.
no_emoticons.head()

0    @ComedyCentralKE These words can kill a Luhya ...
1    The little luhya that remains in me always    ...
2    @cheernatwildcat kill it at battle this weeken...
3    @HomeboyzRadio H.B.R luv dat luhya hit luhyas ...
4    #HangOutFriday hahaha ball ya terby( derby)  L...
Name: tweet, dtype: object

In [15]:
# Put the emoji-free text back into a DataFrame next to the original 'date' column.
# Note: this rebinds no_emoticons from a Series to a DataFrame.
no_emoticons = pd.DataFrame(no_emoticons, columns=['tweet']).assign(
    date=tweets['date']
)

no_emoticons.head()

Unnamed: 0,tweet,date
0,@ComedyCentralKE These words can kill a Luhya ...,;2015-08-03 15:35;0;0;
1,The little luhya that remains in me always ...,;2014-07-11 23:29;0;0;
2,@cheernatwildcat kill it at battle this weeken...,;2014-02-07 18:36;0;2;
3,@HomeboyzRadio H.B.R luv dat luhya hit luhyas ...,;2011-10-09 19:34;0;0;
4,#HangOutFriday hahaha ball ya terby( derby) L...,;2015-08-21 09:27;2;3;


## Remove punctuation and stopwords, and lowercase the text

In [16]:
from nltk.corpus import stopwords


In [17]:
# we are going to do this in one function

# English stopwords, loaded from NLTK on first use and then reused, so the
# corpus is not re-read for every word of every tweet.
_ENGLISH_STOPWORDS = None

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_preprocess(mess, stop_words=None):
    """
    Clean one tweet and split it into tokens.

    Takes in a string of text, then performs the following:

    1. Remove all punctuation (every character in ``string.punctuation``)
    2. Lowercase the text
    3. Split on whitespace and drop all stopwords
    4. Return the remaining words as a list

    Punctuation is stripped before the stopword check, so a contraction such
    as "there's" becomes "theres" and is compared in that form.

    Parameters
    ----------
    mess : str
        The text of a single tweet.
    stop_words : collection of str, optional
        Lowercase words to drop. When omitted, NLTK's English stopword list
        is used (loaded once and cached).

    Returns
    -------
    list of str
        The lowercase, punctuation-free words that are not stopwords.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    # Delete punctuation in a single pass, then lowercase the whole string.
    cleaned = mess.translate(_PUNCTUATION_TABLE).lower()

    # Tokens are already lowercase, so they can be compared directly.
    return [word for word in cleaned.split() if word not in stop_words]

In [20]:
# Tokenise every tweet; each element of the resulting Series is a list of words.
process_tweets = no_emoticons['tweet'].map(text_preprocess)

In [21]:
# Preview the token lists produced for the first ten tweets.
process_tweets.head(10)

0    [comedycentralke, words, kill, luhya, wacha, u...
1    [little, luhya, remains, always, sitokingi, ki...
2    [cheernatwildcat, kill, battle, weekend, wildc...
3    [homeboyzradio, hbr, luv, dat, luhya, hit, luh...
4    [hangoutfriday, hahaha, ball, ya, terby, derby...
5    [luos, kill, blood, thirsty, killers, two, tri...
6    [police, kill, luos, bondo, shot, grannies, mi...
7    [theres, provision, police, kill, innocent, un...
8    [gvnt, us, determined, kill, luos, young, old,...
9    [today, eventsscok, maraga, adventist, uphold,...
Name: tweet, dtype: object

In [22]:
# Put the token lists back into a DataFrame next to the original 'date' column.
cleaned_tweets = pd.DataFrame(process_tweets, columns=['tweet']).assign(
    date=tweets['date']
)

cleaned_tweets.head()

Unnamed: 0,tweet,date
0,"[comedycentralke, words, kill, luhya, wacha, u...",;2015-08-03 15:35;0;0;
1,"[little, luhya, remains, always, sitokingi, ki...",;2014-07-11 23:29;0;0;
2,"[cheernatwildcat, kill, battle, weekend, wildc...",;2014-02-07 18:36;0;2;
3,"[homeboyzradio, hbr, luv, dat, luhya, hit, luh...",;2011-10-09 19:34;0;0;
4,"[hangoutfriday, hahaha, ball, ya, terby, derby...",;2015-08-21 09:27;2;3;


In [23]:
# Compare one raw tweet with its cleaned token list.
example_row = 4931
for frame in (tweets, cleaned_tweets):
    print(frame['tweet'].iloc[example_row])


@odera_sir Lmao! oh yeah! indeed! luos are very united with that :D
['oderasir', 'lmao', 'oh', 'yeah', 'indeed', 'luos', 'united']


In [24]:
# Display the final frame; the rendered output elides the middle rows.
cleaned_tweets

Unnamed: 0,tweet,date
0,"[comedycentralke, words, kill, luhya, wacha, u...",;2015-08-03 15:35;0;0;
1,"[little, luhya, remains, always, sitokingi, ki...",;2014-07-11 23:29;0;0;
2,"[cheernatwildcat, kill, battle, weekend, wildc...",;2014-02-07 18:36;0;2;
3,"[homeboyzradio, hbr, luv, dat, luhya, hit, luh...",;2011-10-09 19:34;0;0;
4,"[hangoutfriday, hahaha, ball, ya, terby, derby...",;2015-08-21 09:27;2;3;
...,...,...
20016,"[symokuraya, lol, yah, prices, exorbitant, hao...",;2012-04-17 17:20;0;0;
20017,"[jalangomwenyewe, hao, wakisii, wafunguliwe, r...",;2014-02-11 06:37;0;1;
20018,"[stop, harbouring, hatred, hate, one, another,...",;2017-11-02 14:34;0;0;
20019,"[seem, hate, kikuyus, n, nickname, kikuyu, nam...",;2017-10-27 06:51;0;0;


##The tweets are now cleaned and stored in the dataframe `cleaned_tweets`, ready for analysis

##                                        THE END