In [2]:
import pandas as pd
from DataSchemaExtractionParsing import *

## Sample Data Extraction & Parsing:

In [5]:
# Load the sample tweet export through the project helper `extract_tweets`.
SAMPLE_TWEETS_PATH = '../../Data/Sample Data/sample.tsv.xlsx'
sample_tweets = extract_tweets(SAMPLE_TWEETS_PATH)

In [3]:
# Size of the freshly loaded sample as a (rows, columns) tuple.
sample_tweets.shape

(9763, 20)

In [4]:
# List every column label in the loaded sample before narrowing it down.
sample_tweets.columns

Index([u'id', u'userId', u'createdAt', u'text', u'longitude', u'latitude',
       u'placeId', u'inReplyTo', u'source', u'truncated', u'placeLatitude',
       u'placeLongitude', u'sourceName', u'sourceUrl', u'userName',
       u'screenName', u'followersCount', u'friendsCount', u'statusesCount',
       u'userLocation'],
      dtype='object')

In [5]:
# Restrict the frame to the timestamp, the tweet text and the
# location-related fields; every other column is dropped from here on.
kept_columns = [
    'createdAt', 'text',
    'longitude', 'latitude',
    'placeLongitude', 'placeLatitude',
    'userLocation',
]
sample_tweets = sample_tweets[kept_columns]
sample_tweets.head()

Unnamed: 0,createdAt,text,longitude,latitude,placeLongitude,placeLatitude,userLocation
0,2016-09-15 20:48:01,se lo dici tu... https://t.co/x7Qm1VHBKL,\N,\N,8.96044,46.0027,Earleen.
1,2016-09-15 20:48:05,https://t.co/noYrTnqmg9,\N,\N,8.22414,46.8131,Suisse
2,2016-09-15 20:48:15,@BesacTof @Leonid_CCCP Tu dois t'engager en si...,\N,\N,5.94082,47.201,Fontain
3,2016-09-15 20:48:18,@Mno0or_Abyat اشوف مظاهرات على قانون العمل الج...,\N,\N,6.16552,45.8011,Shargeyah
4,2016-09-15 20:48:18,Greek night #geneve (@ Emilios in Genève) http...,6.14414,46.1966,6.14319,46.2048,İstanbul/Burgazada


## Data Cleaning:

In [6]:
# Per-column flag: True when the column contains at least one null value.
sample_tweets.isnull().any()

createdAt         True
text              True
longitude         True
latitude          True
placeLongitude    True
placeLatitude     True
userLocation      True
dtype: bool

In [7]:
# Run the project's cleaning helper on the reduced frame and preview the result.
# `clean_tweets` is presumably provided by the star import from
# DataSchemaExtractionParsing; its exact cleaning steps are not visible here.
# NOTE(review): the preview still shows literal \N placeholders in
# longitude/latitude; as strings they are not counted as missing by isnull().
# Confirm whether clean_tweets is expected to convert them.
sample_tweets_cleaned = clean_tweets(sample_tweets)
sample_tweets_cleaned.head()

Unnamed: 0,createdAt,text,longitude,latitude,placeLongitude,placeLatitude,userLocation
0,2016-09-15 20:48:01,se lo dici tu... https://t.co/x7Qm1VHBKL,\N,\N,8.96044,46.0027,Earleen.
1,2016-09-15 20:48:05,https://t.co/noYrTnqmg9,\N,\N,8.22414,46.8131,Suisse
2,2016-09-15 20:48:15,@BesacTof @Leonid_CCCP Tu dois t'engager en si...,\N,\N,5.94082,47.201,Fontain
3,2016-09-15 20:48:18,@Mno0or_Abyat اشوف مظاهرات على قانون العمل الج...,\N,\N,6.16552,45.8011,Shargeyah
4,2016-09-15 20:48:18,Greek night #geneve (@ Emilios in Genève) http...,6.14414,46.1966,6.14319,46.2048,İstanbul/Burgazada


In [25]:
# Inspect the tweet text for index labels 10 through 15 (both ends included).
# `.loc` replaces the deprecated `.ix` indexer, which was removed in pandas 1.0;
# on this integer-labelled index `.ix[10:15]` was label-based as well, so the
# selected rows are unchanged.
sample_tweets['text'].loc[10:15]

10                                                                                                                  seh https://t.co/HDbhb8yVma
11                                                                                                                                Buenas noches
12    Am looking like a 2004 kid I swear you looking like you in your early40s.@Dremoapg she say make I nr touch her you be statue for Museum??
13                     Comunque se sei figa non c'è bisogno di caricare una foto giornaliera sui social, noi maschi non ce lo scordiamo mica eh
14                                                                 Abus sexuels: la gym américaine dans la tourmente... https://t.co/8zEmko2AXK
15                                   @underdeskloser my paypal is insider85@windowslive.com for money request (5$), retweet when u get the cash
Name: text, dtype: object

In [8]:
# Keep only the rows that actually carry tweet text, then re-run the
# per-column null check to see which columns still contain missing values.
has_text = sample_tweets['text'].notnull()
sample_tweets = sample_tweets[has_text]
sample_tweets.isnull().any()

createdAt         False
text              False
longitude          True
latitude           True
placeLongitude     True
placeLatitude      True
userLocation       True
dtype: bool

# Pre-Processing and Cleaning Tweets:

### General Strategy:
1. Parsing and Character Encoding
2. Language Detection
3. Remove tweets for which the geolocation (longitude/latitude) is not present or does not correspond to Switzerland
4. Handling Entities/ Special categories: 
    4.1. Replacing @ instances with <username>
    4.2. Replacing urls with <url>
    4.3. Replacing Emoticons with their word meaning
    4.4. Replacing numbers/phone/fax with <number>
    4.5. Detecting place / city / country / any geolocation cues in any part of the tweet (#)
    4.6. Detecting time cues
5. Tokenization and replacing contractions
6. Part of Speech Tagging to recognize Affective words (Noun, Verbs, Adjectives, Adverb) 
7. Some NER (named-entity recognition) to replace basic entities like Proper Nouns with tag <proper_noun>
8. Stopwords and punctuation removal
9. (possibly spell-checking as well)
10. Reducing multiple occurrences of a character in a word (words like soooooo => so)
11. Lemmatization and term normalization to get less variable versions of the same word. (possibly use thesaurus also) 
12. Remove less frequent words => word count + define a specific threshold

##### N.B: 
TO BE CONTINUED IN OTHER NOTEBOOKS :)