In [2]:
import pandas as pd
import numpy as np
import os

# Cleaning notebook

This notebook is a playground for developing the tweet-cleaning steps. The corresponding script can be found in `swiss_flows/clean_tweets.py`.

## Importing the data

In [28]:
import csv

# Path to the sample tweets, relative to the notebook's working directory.
# `os.path.dirname('__file__')` was applied to a string literal and therefore
# always evaluated to '', so the file was already being resolved relative to
# the working directory; os.path.join states that explicitly and inserts the
# separators itself.
data_path = os.path.join(os.pardir, 'data', 'twitter-swisscom', 'sample.tsv')

# Parser settings taken from Slack: https://adaepfl.slack.com/archives/twitter/p1480527805000002
df = pd.read_csv(
    data_path,
    sep='\t',                # tab-separated values
    encoding='utf-8',
    escapechar='\\',         # backslash escapes the following character
    quoting=csv.QUOTE_NONE,  # quote characters are ordinary text, not field delimiters
    header=None,             # no row is used as a header; columns get integer labels
    na_values='N',           # presumably `\N` null markers, read as `N` after unescaping — TODO confirm
)

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,776522983837954049,735449229028675584,2016-09-15 20:48:01,se lo dici tu... https://t.co/x7Qm1VHBKL,,,51c0e6b24c64e54e,,1,,46.0027,8.96044,Twitter for iPhone,http://twitter.com/#!/download/iphone,plvtone filiae.,hazel_chb,146,110,28621,Earleen.
1,776523000636203010,2741685639,2016-09-15 20:48:05,https://t.co/noYrTnqmg9,,,4e7c21fd2af027c6,,1,,46.8131,8.22414,Twitter for iPhone,http://twitter.com/#!/download/iphone,samara,letisieg,755,2037,3771,Suisse
2,776523045200691200,435239151,2016-09-15 20:48:15,@BesacTof @Leonid_CCCP Tu dois t'engager en si...,,,12eb9b254faf37a3,7.765221e+17,5,,47.201,5.94082,Twitter for Android,http://twitter.com/download/android,lebrübrü❤,lebrubru,811,595,30191,Fontain
3,776523058404290560,503244217,2016-09-15 20:48:18,@Mno0or_Abyat اشوف مظاهرات على قانون العمل الج...,,,30bcd7f767b4041e,7.765216e+17,1,,45.8011,6.16552,Twitter for iPhone,http://twitter.com/#!/download/iphone,عبدالله القنيص,bingnais,28433,417,12262,Shargeyah
4,776523058504925185,452805259,2016-09-15 20:48:18,Greek night #geneve (@ Emilios in Genève) http...,6.14414,46.1966,c3a6437e1b1a726d,,3,,46.2048,6.14319,foursquare,http://foursquare.com,Alkan Şenli,Alkanoli,204,172,3390,İstanbul/Burgazada


Next, we filter out the columns that are not needed. To do this we first need the column names, which we read from `schema.txt`.

In [22]:
# Path to the schema description, relative to the notebook's working directory.
# `os.path.dirname('__file__')` was applied to a string literal and therefore
# always evaluated to '', so os.path.join resolves to the same location while
# handling the separators itself.
schema_path = os.path.join(os.pardir, 'data', 'twitter-swisscom', 'schema.txt')

# Load the schema: fields are separated by runs of whitespace and no row is
# used as a header. `sep=r'\s+'` is the documented equivalent of
# `delim_whitespace=True`, which is deprecated since pandas 2.2.
schema = pd.read_csv(schema_path, sep=r'\s+', header=None)
schema

Unnamed: 0,0,1,2,3,4,5
0,1,id,bigint(20),UNSIGNED,No,
1,2,userId,bigint(20),UNSIGNED,No,
2,3,createdAt,timestamp,No,0000-00-00,00:00:00
3,4,text,text,utf8_unicode_ci,No,
4,5,longitude,float,Yes,,
5,6,latitude,float,Yes,,
6,7,placeId,varchar(25),utf8_general_ci,Yes,
7,8,inReplyTo,bigint(20),UNSIGNED,Yes,
8,9,source,int(10),UNSIGNED,No,
9,10,truncated,bit(1),No,,


In [29]:
# Label the tweet columns with the field names listed in the schema
# (the schema column labelled 1 holds the names).
df.columns = schema.loc[:, 1]
df.head(5)

1,id,userId,createdAt,text,longitude,latitude,placeId,inReplyTo,source,truncated,placeLatitude,placeLongitude,sourceName,sourceUrl,userName,screenName,followersCount,friendsCount,statusesCount,userLocation
0,776522983837954049,735449229028675584,2016-09-15 20:48:01,se lo dici tu... https://t.co/x7Qm1VHBKL,,,51c0e6b24c64e54e,,1,,46.0027,8.96044,Twitter for iPhone,http://twitter.com/#!/download/iphone,plvtone filiae.,hazel_chb,146,110,28621,Earleen.
1,776523000636203010,2741685639,2016-09-15 20:48:05,https://t.co/noYrTnqmg9,,,4e7c21fd2af027c6,,1,,46.8131,8.22414,Twitter for iPhone,http://twitter.com/#!/download/iphone,samara,letisieg,755,2037,3771,Suisse
2,776523045200691200,435239151,2016-09-15 20:48:15,@BesacTof @Leonid_CCCP Tu dois t'engager en si...,,,12eb9b254faf37a3,7.765221e+17,5,,47.201,5.94082,Twitter for Android,http://twitter.com/download/android,lebrübrü❤,lebrubru,811,595,30191,Fontain
3,776523058404290560,503244217,2016-09-15 20:48:18,@Mno0or_Abyat اشوف مظاهرات على قانون العمل الج...,,,30bcd7f767b4041e,7.765216e+17,1,,45.8011,6.16552,Twitter for iPhone,http://twitter.com/#!/download/iphone,عبدالله القنيص,bingnais,28433,417,12262,Shargeyah
4,776523058504925185,452805259,2016-09-15 20:48:18,Greek night #geneve (@ Emilios in Genève) http...,6.14414,46.1966,c3a6437e1b1a726d,,3,,46.2048,6.14319,foursquare,http://foursquare.com,Alkan Şenli,Alkanoli,204,172,3390,İstanbul/Burgazada


In [30]:
# Restrict the frame to the columns used downstream, in this order
useful_col = [
    'id',
    'userId',
    'createdAt',
    'placeLongitude',
    'placeLatitude',
    'userLocation',
]
df = df.loc[:, useful_col]
df.head(5)

1,id,userId,createdAt,placeLongitude,placeLatitude,userLocation
0,776522983837954049,735449229028675584,2016-09-15 20:48:01,8.96044,46.0027,Earleen.
1,776523000636203010,2741685639,2016-09-15 20:48:05,8.22414,46.8131,Suisse
2,776523045200691200,435239151,2016-09-15 20:48:15,5.94082,47.201,Fontain
3,776523058404290560,503244217,2016-09-15 20:48:18,6.16552,45.8011,Shargeyah
4,776523058504925185,452805259,2016-09-15 20:48:18,6.14319,46.2048,İstanbul/Burgazada


In [31]:
# Drop rows which have missing values in important columns.
# Both place coordinates are required: listing 'placeLatitude' twice would
# leave rows with a missing longitude in the frame.
imp_col = ['userId', 'createdAt', 'placeLongitude', 'placeLatitude']
df = df.dropna(subset=imp_col, how='any')  # 'any': one missing value is enough to drop the row
df.head()

1,id,userId,createdAt,placeLongitude,placeLatitude,userLocation
0,776522983837954049,735449229028675584,2016-09-15 20:48:01,8.96044,46.0027,Earleen.
1,776523000636203010,2741685639,2016-09-15 20:48:05,8.22414,46.8131,Suisse
2,776523045200691200,435239151,2016-09-15 20:48:15,5.94082,47.201,Fontain
3,776523058404290560,503244217,2016-09-15 20:48:18,6.16552,45.8011,Shargeyah
4,776523058504925185,452805259,2016-09-15 20:48:18,6.14319,46.2048,İstanbul/Burgazada
