In [1]:
import pandas as pd
import fasttext

# See more text in the output:
pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', None)

data_path = os.path.normpath(os.getcwd() + os.sep + os.pardir) + '/data/'

import vienna_paper.prep_data
import vienna_paper.tidylopy

%load_ext autoreload
%autoreload 2

### Import data

In [2]:
# Import and concatenate data:
innere = pd.read_csv(data_path + 'raw/1 Innere Stadt.csv', low_memory=False)
innere['district_number'] = '1'
innere['district_name'] = 'Innere Stadt'

josef = pd.read_csv(data_path + 'raw/8 Josefstadt.csv')
josef['district_number'] = '8'
josef['district_name'] = 'Josefstadt'

ottakring = pd.read_csv(data_path + 'raw/16 Ottakring.csv')
ottakring['district_number'] = '16'
ottakring['district_name'] = 'Ottakring'

wahring = pd.read_csv(data_path + 'raw/18 Währing.csv')
wahring['district_number'] = '18'
wahring['district_name'] = 'Wahring'

dobling = pd.read_csv(data_path + 'raw/19 Döbling.csv')
dobling['district_number'] = '19'
dobling['district_name'] = 'Dobling'

floridsdorf = pd.read_csv(data_path + 'raw/21 Floridsdorf.csv')
floridsdorf['district_number'] = '21'
floridsdorf['district_name'] = 'Floridsdorf'

working_set = pd.concat([innere, josef, ottakring, wahring, dobling, floridsdorf])

### Clean data

In [64]:
# Clean up urls, @ mentions and # symbol, emojis (keeps text of hashtags), via https://aronakhmad.medium.com/twitter-data-cleaning-using-python-db1ec2f28f08
working_set['text_clean'] = working_set['text'].str.replace('&amp;', '', regex=False)
working_set['text_clean'] = working_set['text_clean'].str.replace(r'(@[A-Za-z0–9_]+)|[^\w\s]|#|http\S+', '', regex=True)
# working_set['text_clean'] = working_set['text_clean'].str.replace(r'(@[A-Za-z0–9_]+)|[^\w\s#]|http\S+', '', regex=True)  # Keep hashtags?

# Get rid of these canned posts:
working_set['text_clean'] = working_set['text_clean'].str.replace('Just posted a photo ', '', regex=False)
# Replace new lines with spaces:
working_set['text_clean'] = working_set['text_clean'].str.replace('\n', ' ', regex=False)

### Label languages
More accurate than Twitter's in-built method which uses profile language I think...

In [45]:
# Load language model -- using fasttext
lang_detect_model = fasttext.load_model(data_path + 'external/lid.176.bin')
# Label each comment with language
working_set['text_language'] = working_set['text_clean'].apply(
    lambda x: lang_detect_model.predict(x, k=1)[0][0])

# working_set[['text_language', 'text', 'text_clean']].sample(20)  # Check out results




### Explore data

In [92]:
# For nice tables:
working_set['District'] = working_set['district_number'] + ' - ' + working_set['district_name']

In [47]:
# Overall counts
overall_counts = working_set.groupby('District')['text_clean'].count().reset_index().sort_values('text_clean', ascending=False).rename(columns={'text_clean': 'overall_district_count'})

In [None]:
# Top five languages of Tweets per district
working_set.groupby(['district_number','district_name', 'text_language'])['text_clean'].count().reset_index().sort_values(['district_number', 'text_clean'], ascending=[1, 0]).groupby('district_number').head(5).reset_index(drop=True)

In [24]:
# Copy to clipboard & get percentages
working_set['district_number'] = working_set['district_number'].astype(int)  # to make sort nice
table_df = working_set.groupby(['district_number','District', 'text_language'])['text_clean'].count().reset_index().sort_values(['district_number', 'text_clean'], ascending=[1, 0]).groupby('district_number').head(5).reset_index(drop=True).drop(columns = ['district_number']).merge(overall_counts, on = 'District', how = 'left')
table_df['percent of district'] = table_df['text_clean'] / table_df['overall_district_count']
table_df.to_clipboard()

### Weighted log odds

- What were the most distinctive LANGUAGES per district? What were the most distinctive TERMS / sets of TERMS?

In [49]:
# Languages
lang_wlo_df = working_set.groupby(['District', 'text_language'])['text_clean'].count().reset_index().rename(columns={'text_clean':'n'})
lang_wlo = vienna_paper.tidylopy.get_weighted_log_odds(lang_wlo_df, 'District', 'text_language', 'n')

In [None]:
# lang_wlo.groupby('District').head(5).reset_index()[['District', 'text_language', 'log_odds_weighted']].sort_values(['District', 'log_odds_weighted'], ascending = [1, 0]).to_clipboard()
lang_wlo.groupby('District').head(5).reset_index()[['District', 'text_language', 'n', 'log_odds_weighted']].sort_values(['District', 'log_odds_weighted'], ascending = [1, 0]) #.to_clipboard()

In [None]:
working_set[working_set['text_language'] == '__label__tl'][['text']]

Weighted log odds of unigrams (words)

In [105]:
# Unigrams / bigrams: -- start with unigrams first maybe
# First transform data into unigrams / bigrams!
working_set['unigrams'] = vienna_paper.prep_data.get_unigrams(working_set, 'text_clean')

In [106]:
# First get in right format to get most distinctive unigrams:
working_set['unigrams'] = working_set['unigrams'].astype(str)
working_set['unigrams'] = working_set['unigrams'].str.strip('[]').str.replace("'", "")
working_set_long = working_set.assign(unigrams=working_set['unigrams'].str.split(',')).explode('unigrams')
working_set_long['unigrams'] = working_set_long['unigrams'].str.replace(' ', '').str.lower()
working_set_long = working_set_long[working_set_long['unigrams']!='']
# working_set_long = working_set_long[~working_set_long['unigrams'].str.isnumeric()]

In [107]:
# Unigrams
unigram_wlo_df = working_set_long.groupby(['district_name', 'unigrams'])['text_clean'].count().reset_index().rename(columns={'text_clean':'n'})
# Experiment -- get rid of vienna, wien, austria
unigram_wlo_df = unigram_wlo_df[~unigram_wlo_df['unigrams'].isin(['vienna', 'wien', 'austria', 'st', 'wiener'])]
# unigram_wlo_df = unigram_wlo_df[unigram_wlo_df['n']>1]
unigram_wlo = vienna_paper.tidylopy.get_weighted_log_odds(unigram_wlo_df, 'district_name', 'unigrams', 'n')

In [98]:
# Look at top 5 words per district:
freq_wlo_df = working_set_long.groupby(['District', 'unigrams'])['text_clean'].count().reset_index().rename(columns={'text_clean':'n'})
# Experiment -- get rid of vienna, wien, austria
freq_wlo_df = freq_wlo_df[~freq_wlo_df['unigrams'].isin(['vienna', 'wien', 'austria', 'st', 'wiener'])]


In [85]:
unigram_wlo.head(20)

Unnamed: 0,district_name,unigrams,n,alpha,y_wi,y_w,n_i,omega_wi,omega_w,log_odds,sigma2_wi,log_odds_weighted
28462,Wahring,vienna,19,7194,7213,50358,31862,0.292629,0.15335,0.646184,0.000158,51.327069
27611,Ottakring,vienna,68,7194,7262,50358,37492,0.240225,0.15335,0.448856,0.000158,35.758767
26420,Josefstadt,vienna,99,7194,7293,50358,40116,0.222192,0.15335,0.370821,0.000157,29.597021
27847,Wahring,austria,11,2700,2711,18900,31862,0.092999,0.052523,0.57134,0.000422,27.819718
28487,Wahring,wien,25,2322,2347,16254,31862,0.079519,0.04484,0.5729,0.000488,25.944619
27240,Ottakring,museum,2,650,652,2600,37492,0.017698,0.006912,0.940168,0.001918,21.465491
26041,Josefstadt,museum,4,650,654,2600,40116,0.016573,0.006912,0.874477,0.001914,19.990113
26623,Ottakring,austria,20,2700,2720,18900,37492,0.078224,0.052523,0.398331,0.000421,19.423678
27656,Ottakring,wien,26,2322,2348,16254,37492,0.066811,0.04484,0.398771,0.000487,18.062277
28358,Wahring,st,2,497,499,2485,31862,0.01591,0.006604,0.87923,0.002406,17.923235


In [72]:
# Follow up
# working_set[working_set['district_name'] == 'Dobling'][['text_clean', 'text']]
working_set[(working_set['text_clean'].str.contains('Stephens')) & (working_set['district_name'] == 'Dobling')]

Unnamed: 0,user_id,status_id,created_at,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,reply_to_screen_name,is_quote,is_retweet,favorite_count,retweet_count,quote_count,reply_count,hashtags,symbols,urls_url,urls_t.co,urls_expanded_url,media_url,media_t.co,media_expanded_url,media_type,ext_media_url,ext_media_t.co,ext_media_expanded_url,ext_media_type,mentions_user_id,mentions_screen_name,lang,quoted_status_id,quoted_text,quoted_created_at,quoted_source,quoted_favorite_count,quoted_retweet_count,quoted_user_id,quoted_screen_name,quoted_name,quoted_followers_count,quoted_friends_count,quoted_statuses_count,quoted_location,quoted_description,quoted_verified,retweet_status_id,retweet_text,retweet_created_at,retweet_source,retweet_favorite_count,retweet_retweet_count,retweet_user_id,retweet_screen_name,retweet_name,retweet_followers_count,retweet_friends_count,retweet_statuses_count,retweet_location,retweet_description,retweet_verified,place_url,place_name,place_full_name,place_type,country,country_code,geo_coords,coords_coords,bbox_coords,status_url,name,location,description,url,protected,followers_count,friends_count,listed_count,statuses_count,favourites_count,account_created_at,verified,profile_url,profile_expanded_url,account_lang,profile_banner_url,profile_background_url,profile_image_url,lat,lng,in_district,district_number,district_name,text_clean,text_language,District,unigrams
116,x274654303,x595723218134138880,2015-05-05 22:54:17,AdamPlotkin,"#Nussberger #Vineyard, mere 20 mins from #Vienna city center! St. Stephen's Cathedral in distance.… https://t.co/sWNHwEWIMO",Instagram,,,,,False,False,0,0,0,0,Nussberger Vineyard Vienna,,instagram.com/p/2UVa2kolz3/,https://t.co/sWNHwEWIMO,https://instagram.com/p/2UVa2kolz3/,,,,,,,,,,,en,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,https://api.twitter.com/1.1/geo/id/9f659d51e5c5deae.json,Vienna,"Vienna, Austria",city,Austria,AT,48.26551228 16.35072411,16.35072411 48.26551228,16.18218 16.18218 16.577511 16.577511 48.117666 48.322574 48.322574 48.117666,https://twitter.com/AdamPlotkin/status/595723218134138880,Adam Plotkin,"Chasing powder, food, & wine!","Husband & father, attorney, entrepreneur, die-hard powder hound, CMS Certified Sommelier, sports/political junkie, speaker re health law & collection law.",,False,1790,977,168,22591,6976,2011-03-30 18:42:49,False,,,,https://pbs.twimg.com/profile_banners/274654303/1354647164,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/469269941271527424/iziQNEdC_normal.jpeg,48.265512,16.350724,Yes,19,Dobling,Nussberger Vineyard mere 20 mins from Vienna city center St Stephens Cathedral in distance,__label__en,19 - Dobling,"Nussberger, Vineyard, mere, 20, min, Vienna, city, center, St, Stephens, Cathedral, distance"


In [97]:
unigram_wlo.head()

Unnamed: 0,district_name,unigrams,n,alpha,y_wi,y_w,n_i,omega_wi,omega_w,log_odds,sigma2_wi,log_odds_weighted
27266,Ottakring,museum,2,648,650,2592,25152,0.026528,0.008926,1.089233,0.001924,24.830678
28381,Wahring,st,2,497,499,2485,19583,0.026148,0.008555,1.117297,0.002406,22.776275
26068,Josefstadt,museum,4,648,652,2592,27672,0.02413,0.008926,0.994483,0.00192,22.698554
27339,Ottakring,palace,1,266,267,798,25152,0.010729,0.002731,1.368235,0.004998,19.352757
1479,Dobling,stephens,1,434,435,1302,35743,0.01232,0.004464,1.015213,0.003067,18.331895


In [137]:
working_set[['lat', 'lng']].head()
working_set.groupby(['District', 'lat', 'lng']).count().reset_index().sort_values(['district_number', 'text_clean'], ascending = [1,0]).groupby('District').head(1)

Unnamed: 0,District,lat,lng,user_id,status_id,created_at,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,reply_to_screen_name,is_quote,is_retweet,favorite_count,retweet_count,quote_count,reply_count,hashtags,symbols,urls_url,urls_t.co,urls_expanded_url,media_url,media_t.co,media_expanded_url,media_type,ext_media_url,ext_media_t.co,ext_media_expanded_url,ext_media_type,mentions_user_id,mentions_screen_name,lang,quoted_status_id,quoted_text,quoted_created_at,quoted_source,quoted_favorite_count,quoted_retweet_count,quoted_user_id,quoted_screen_name,quoted_name,quoted_followers_count,quoted_friends_count,quoted_statuses_count,quoted_location,quoted_description,quoted_verified,retweet_status_id,retweet_text,retweet_created_at,retweet_source,retweet_favorite_count,retweet_retweet_count,retweet_user_id,retweet_screen_name,retweet_name,retweet_followers_count,retweet_friends_count,retweet_statuses_count,retweet_location,retweet_description,retweet_verified,place_url,place_name,place_full_name,place_type,country,country_code,geo_coords,coords_coords,bbox_coords,status_url,name,location,description,url,protected,followers_count,friends_count,listed_count,statuses_count,favourites_count,account_created_at,verified,profile_url,profile_expanded_url,account_lang,profile_banner_url,profile_background_url,profile_image_url,in_district,district_number,district_name,text_clean,text_language,unigrams,bigrams
0,1 - Innere Stadt,48.1998,16.373549,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1
1738,16 - Ottakring,48.205037,16.336507,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1
1834,18 - Wahring,48.217491,16.341445,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1
1910,19 - Dobling,48.232335,16.35368,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1
2090,21 - Floridsdorf,48.238428,16.397278,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1
2273,8 - Josefstadt,48.206575,16.348696,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1


In [108]:
# Top five distinctive unigrams of Tweets per district
top_wlo_unigrams_district = unigram_wlo.sort_values(['district_name', 'log_odds_weighted'], ascending=[1, 0]).groupby('district_name').head(5).reset_index(drop=True)[['district_name', 'unigrams', 'n', 'log_odds_weighted']]
# Assign lon / lat for graphing:
coord_dict_lat = {'Innere Stadt': '48.209037', 'Josefstadt': '48.211305', 'Ottakring': '48.214644', 'Wahring': '48.228309', 'Dobling': '48.247982', 'Floridsdorf': '48.262683'}
coord_dict_lng = {'Innere Stadt': '16.370868', 'Josefstadt': '16.347494', 'Ottakring': '16.302710', 'Wahring': '16.340726', 'Dobling': '16.342478', 'Floridsdorf': '16.402956'}

top_wlo_unigrams_district['lat'] = top_wlo_unigrams_district['district_name'].map(coord_dict_lat)
top_wlo_unigrams_district['lng'] = top_wlo_unigrams_district['district_name'].map(coord_dict_lng)
top_wlo_unigrams_district.to_csv(data_path + 'processed/district_unigram_wlo.csv', index=False)


In [118]:
# Look at top 5 words per district:
freq_wlo_df = working_set_long.groupby(['district_name', 'unigrams'])['text_clean'].count().reset_index().rename(columns={'text_clean':'n'})
# Experiment -- get rid of vienna, wien, austria
freq_wlo_df = freq_wlo_df[~freq_wlo_df['unigrams'].isin(['vienna', 'wien', 'austria', 'st', 'wiener', 'ich', 'der', 'und', 'auf', 'ein', 'die'])]

top_freq_unigrams_district = freq_wlo_df.sort_values(['district_name', 'n'], ascending=[1, 0]).groupby('district_name').head(5).reset_index(drop=True)[['district_name', 'unigrams', 'n']]
top_freq_unigrams_district['lat'] = top_freq_unigrams_district['district_name'].map(coord_dict_lat)
top_freq_unigrams_district['lng'] = top_freq_unigrams_district['district_name'].map(coord_dict_lng)
top_freq_unigrams_district.to_csv(data_path + 'processed/district_unigram_freq.csv', index=False)

In [117]:
top_freq_unigrams_district

Unnamed: 0,district_name,unigrams,n,lat,lng
0,Dobling,kahlenberg,80,48.247982,16.342478
1,Dobling,hohe,66,48.247982,16.342478
2,Dobling,warte,56,48.247982,16.342478
3,Dobling,stadion,49,48.247982,16.342478
4,Dobling,girl,47,48.247982,16.342478
5,Floridsdorf,rock,44,48.262683,16.402956
6,Floridsdorf,donauinselfest,42,48.262683,16.402956
7,Floridsdorf,ihr,37,48.262683,16.402956
8,Floridsdorf,rockinvienna,29,48.262683,16.402956
9,Floridsdorf,donauinsel,25,48.262683,16.402956
