In [52]:
import pandas as pd
import numpy as np
import os

# Cleaning notebook

This notebook serves as a playground for cleaning the tweets. The corresponding script can be found in `swiss_flows/clean.py`.

## Importing the data

In [53]:
import csv

# Path to the sample tweet dump, relative to the notebook's working directory.
# Built with os.path.join so the separators are always correct.
data_path = os.path.join(os.pardir, 'data', 'twitter-swisscom', 'sample.tsv')

# Taken from Slack: https://adaepfl.slack.com/archives/twitter/p1480527805000002
# The file is read as tab-separated with no header row, so the columns are
# labelled 0..n-1. QUOTE_NONE disables quote handling, so quote characters
# inside a field are kept as ordinary text.
df = pd.read_csv(data_path, sep="\t", encoding='utf-8', quoting=csv.QUOTE_NONE, header=None)

df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,776522983837954049,735449229028675584,2016-09-15 20:48:01,se lo dici tu... https://t.co/x7Qm1VHBKL,\N,\N,51c0e6b24c64e54e,\N,1,\t46.0027,8.96044,Twitter for iPhone,http://twitter.com/#!/download/iphone,plvtone filiae.,hazel_chb,146,110,28621,Earleen.
1,776523000636203010,2741685639,2016-09-15 20:48:05,https://t.co/noYrTnqmg9,\N,\N,4e7c21fd2af027c6,\N,1,\t46.8131,8.22414,Twitter for iPhone,http://twitter.com/#!/download/iphone,samara,letisieg,755,2037,3771,Suisse
2,776523045200691200,435239151,2016-09-15 20:48:15,@BesacTof @Leonid_CCCP Tu dois t'engager en si...,\N,\N,12eb9b254faf37a3,776522113859608576,5,\t47.201,5.94082,Twitter for Android,http://twitter.com/download/android,lebrübrü❤,lebrubru,811,595,30191,Fontain
3,776523058404290560,503244217,2016-09-15 20:48:18,@Mno0or_Abyat اشوف مظاهرات على قانون العمل الج...,\N,\N,30bcd7f767b4041e,776521597515624448,1,\t45.8011,6.16552,Twitter for iPhone,http://twitter.com/#!/download/iphone,عبدالله القنيص,bingnais,28433,417,12262,Shargeyah
4,776523058504925185,452805259,2016-09-15 20:48:18,Greek night #geneve (@ Emilios in Genève) http...,6.14414,46.1966,c3a6437e1b1a726d,\N,3,\t46.2048,6.14319,foursquare,http://foursquare.com,Alkan Şenli,Alkanoli,204,172,3390,İstanbul/Burgazada


In [54]:
# Discard the columns that are not needed, selected by position:
# column 3 plus columns 6 through 17.
unused_positions = [3] + list(range(6, 18))
df = df.drop(df.columns[unused_positions], axis=1)
df.head()

Unnamed: 0,0,1,2,4,5,18
0,776522983837954049,735449229028675584,2016-09-15 20:48:01,\N,\N,Earleen.
1,776523000636203010,2741685639,2016-09-15 20:48:05,\N,\N,Suisse
2,776523045200691200,435239151,2016-09-15 20:48:15,\N,\N,Fontain
3,776523058404290560,503244217,2016-09-15 20:48:18,\N,\N,Shargeyah
4,776523058504925185,452805259,2016-09-15 20:48:18,6.14414,46.1966,İstanbul/Burgazada


In [55]:
# Give the six remaining columns descriptive names (in their current order)
column_names = [
    'id',
    'user_id',
    'created_at',
    'longitude',
    'latitude',
    'user_location',
]
df.columns = column_names
df.head()

Unnamed: 0,id,user_id,created_at,longitude,latitude,user_location
0,776522983837954049,735449229028675584,2016-09-15 20:48:01,\N,\N,Earleen.
1,776523000636203010,2741685639,2016-09-15 20:48:05,\N,\N,Suisse
2,776523045200691200,435239151,2016-09-15 20:48:15,\N,\N,Fontain
3,776523058404290560,503244217,2016-09-15 20:48:18,\N,\N,Shargeyah
4,776523058504925185,452805259,2016-09-15 20:48:18,6.14414,46.1966,İstanbul/Burgazada


In [56]:
# Replace the \N placeholders by None so that pandas can filter out those values.
# The pattern is anchored (^...$) so that only cells that consist solely of the
# placeholder are replaced: with a non-string replacement value, a regex
# replace swaps out the whole cell whenever the pattern is found anywhere in
# it, so an unanchored pattern would also null free-text values (e.g. a
# user_location) that merely contain "\N".
df = df.replace({r'^\\N$': None}, regex=True)
df.head()

Unnamed: 0,id,user_id,created_at,longitude,latitude,user_location
0,776522983837954049,735449229028675584,2016-09-15 20:48:01,,,Earleen.
1,776523000636203010,2741685639,2016-09-15 20:48:05,,,Suisse
2,776523045200691200,435239151,2016-09-15 20:48:15,,,Fontain
3,776523058404290560,503244217,2016-09-15 20:48:18,,,Shargeyah
4,776523058504925185,452805259,2016-09-15 20:48:18,6.14414,46.1966,İstanbul/Burgazada


In [57]:
# Keep only the rows in which every important column holds a value
required_columns = ['user_id', 'created_at', 'longitude', 'latitude']
is_complete = df[required_columns].notnull().all(axis=1)
df = df[is_complete]
df.head()

Unnamed: 0,id,user_id,created_at,longitude,latitude,user_location
4,776523058504925185,452805259,2016-09-15 20:48:18,6.14414,46.1966,İstanbul/Burgazada
26,776523388911255552,1142605741,2016-09-15 20:49:37,8.95092,46.006,Lahore
31,776523419261087744,4695376338,2016-09-15 20:49:44,6.81899,47.1003,"La Chaux-de-Fonds, Neuchâtel"
41,776523645514608641,1142605741,2016-09-15 20:50:38,8.94542,45.9915,Lahore
54,776523844483846144,295873403,2016-09-15 20:51:26,5.99278,47.2763,Besançon
