In [1]:
import pandas as pd
import numpy as np
from glob import glob
from collections import defaultdict

import re

In [2]:
DATA_DIRS = ["ChildEducation/Education", "Privacy/Privacy",
             "Vaccine/vaccine",
             "SeatBelt/Seatbelt",
             "GunControl/gunControl",
             "SkinDamage/SkinDamage"
            ]
USER_HANDLE_REGEX = re.compile(r'twitter\.com/(.+)/status/(.+)')
USER_HANDLE_REGEX.findall('http://twitter.com/malkanen/status/12233311')

[('malkanen', '12233311')]

In [3]:
#%%time
datasets = {}
for dirname in DATA_DIRS:
    print dirname
    df = pd.read_csv("%s_processed.csv" % dirname)
    df_orig = pd.read_csv("%s_noDublict.csv" % dirname)
    print df_orig.shape, df.shape
    assert df_orig.shape[0] == df.shape[0], "Shape mismatch between df_orig, and df"
    df_merged = pd.concat([df, df_orig[["URL", "Contents"]]], axis=1)
    print df_merged.shape
    assert df_merged.shape[0] == df.shape[0], "Shape mismatch between df_merged, and df"
    assert ((df_merged.Author != df_merged.URL.apply(
            lambda x: "@%s" % USER_HANDLE_REGEX.findall(x)[0][0])
           ).sum() == 0), "Authors are not the same in merged and other."
    df_merged["t_id"] = df_merged["URL"].apply(lambda x: USER_HANDLE_REGEX.findall(x)[0][1]).astype(int)
    df_merged = df_merged.drop_duplicates(subset=["t_id"])
    print df_merged.shape
    for label_col in ["sentiment", "sentiment_subj", "subjectvity_type", "negation"]:
        print df_merged[label_col].value_counts()
    datasets[dirname] = df_merged

ChildEducation/Education
(10808, 17) (10808, 25)
(10808, 27)
(10808, 28)
Basic Neutral     6652
Basic Positive    2295
Basic Negative    1861
Name: sentiment, dtype: int64
neutral     9861
positive     742
negative     205
Name: sentiment_subj, dtype: int64
neutralsubj    10181
strongsubj       328
weaksubj         299
Name: subjectvity_type, dtype: int64
0    9814
1     994
Name: negation, dtype: int64
Privacy/Privacy
(73593, 17) (73593, 25)
(73593, 27)
(73593, 28)
Basic Neutral     37350
Basic Negative    29103
Basic Positive     7140
Name: sentiment, dtype: int64
neutral     67583
positive     3848
negative     2162
Name: sentiment_subj, dtype: int64
neutralsubj    70078
weaksubj        2105
strongsubj      1410
Name: subjectvity_type, dtype: int64
0    66227
1     7366
Name: negation, dtype: int64
Vaccine/vaccine
(40713, 16) (40713, 25)
(40713, 27)
(40713, 28)
Basic Neutral     26223
Basic Negative    10478
Basic Positive     4012
Name: sentiment, dtype: int64
neutral     37736
pos

In [4]:
def get_user_from_tweet_url(x):
    return "@%s" % USER_HANDLE_REGEX.findall(x)[0][0]

In [5]:
df_meta = pd.read_csv("TID_META.txt", sep="\t", encoding='utf-8')

In [6]:
df_meta.shape

(328318, 21)

In [7]:
df_meta.dtypes

t_id              int64
t_created        object
t_retweets        int64
t_favorites       int64
t_is_reply         bool
t_is_quote         bool
t_n_hashtags      int64
t_n_urls          int64
t_n_mentions      int64
t_n_media         int64
u_id              int64
u_created        object
u_n_listed        int64
u_n_favorites     int64
u_n_followers     int64
u_n_friends       int64
u_n_statuses      int64
u_is_verified      bool
u_location       object
u_name           object
u_url            object
dtype: object

In [8]:
df_merged.dtypes

GUID                float64
Date                 object
processedPost        object
Author               object
Name                 object
State/Region         object
City/Urban           object
sentiment            object
Gender               object
Posts                 int64
Followers             int64
Following             int64
RT?                  object
mention?             object
hashtag?             object
URL?                 object
noun                  int64
verb                  int64
adjective             int64
preposition           int64
adverb                int64
sentiment_subj       object
subjectvity_type     object
negation              int64
count_tweet           int64
URL                  object
Contents             object
t_id                  int64
dtype: object

In [9]:
df_meta.columns

Index([u't_id', u't_created', u't_retweets', u't_favorites', u't_is_reply',
       u't_is_quote', u't_n_hashtags', u't_n_urls', u't_n_mentions',
       u't_n_media', u'u_id', u'u_created', u'u_n_listed', u'u_n_favorites',
       u'u_n_followers', u'u_n_friends', u'u_n_statuses', u'u_is_verified',
       u'u_location', u'u_name', u'u_url'],
      dtype='object')

In [10]:
df_merged_meta = df_merged.merge(df_meta, how="left", on="t_id")
df_merged_meta.shape

(14128, 48)

In [11]:
df_merged.shape

(14128, 28)

In [12]:
df_all = pd.concat([v.assign(topic_name=lambda x: k) for k,v in datasets.iteritems()], axis=0)
df_all.shape

(246869, 30)

In [13]:
df_all.topic_name.value_counts()

Privacy/Privacy             73593
SeatBelt/Seatbelt           73270
Vaccine/vaccine             40713
GunControl/gunControl       34357
SkinDamage/SkinDamage       14128
ChildEducation/Education    10808
Name: topic_name, dtype: int64

In [14]:
df_merged_meta = df_all.merge(df_meta, how="left", on="t_id")
df_merged_meta.shape

(246885, 50)

In [15]:
df_merged_meta["is_controvertial"] = df_merged_meta.topic_name.isin(["Privacy/Privacy", "Vaccine/vaccine"])
df_merged_meta.is_controvertial.value_counts()

False    132573
True     114312
Name: is_controvertial, dtype: int64

## Merge URL types

In [16]:
df_merged_meta.columns

Index([          u'Author',       u'City/Urban',  u'City/Urban Area',
               u'Contents',             u'Date',        u'Followers',
              u'Following',             u'GUID',           u'Gender',
                   u'Name',            u'Posts',              u'RT?',
           u'State/Region',              u'URL',             u'URL?',
              u'adjective',           u'adverb',      u'count_tweet',
               u'hashtag?',         u'mention?',         u'negation',
                   u'noun',      u'preposition',    u'processedPost',
              u'sentiment',   u'sentiment_subj', u'subjectvity_type',
                   u't_id',       u'topic_name',             u'verb',
              u't_created',       u't_retweets',      u't_favorites',
             u't_is_reply',       u't_is_quote',     u't_n_hashtags',
               u't_n_urls',     u't_n_mentions',        u't_n_media',
                   u'u_id',        u'u_created',       u'u_n_listed',
          u'u_n_favo

In [17]:
df_mapped_cats = pd.read_csv("TID_URL_CATS.txt", sep="\t").assign(
    CATS=lambda x: x.CATS.apply(lambda k: k.split("|"))
)
df_mapped_cats.head()

Unnamed: 0,TID,URL,CATS
0,682904901916225536,https://twitter.com/photogchad_WTSP/status/682...,"[socialmedia, twitter]"
1,682915876316692480,http://www.investirdanslenfance.ca/,[UNK]
2,682985833821941760,http://TinyURL.com/NewYearCure,[commercial]
3,682952771746664448,http://TinyURL.com/NewYearCure,[commercial]
4,682830450969059328,http://yournewswire.com/donald-trump-vaccines-...,[fakenews]


In [18]:
URL_DICT = dict(zip(df_mapped_cats.URL.values, df_mapped_cats.CATS.values))
URL_DICT["http://TinyURL.com/NewYearCure"]

['commercial']

In [19]:
len(URL_DICT)

119558

In [20]:
df_mapped_cats.TID.value_counts().head()

770862566662955008    3
700152617033289728    3
735430291821596673    3
738960203949965312    3
716003978807476224    3
Name: TID, dtype: int64

In [21]:
df_mapped_cats[df_mapped_cats.TID == 700152617033289728]

Unnamed: 0,TID,URL,CATS
17849,700152617033289728,http://www.hcplive.com/contributor/ryan-gray-m...,[UNK]
17850,700152617033289728,http://www.foxla.com/news/local-news/89941411-...,[UNK]
17851,700152617033289728,http://www.apple.com/customer-letter/,[commercial]


In [22]:
df_tweet_cat_counts = df_mapped_cats.groupby("TID")["CATS"].apply(lambda x: sum(x, []))
df_tweet_cat_counts.head()

TID
682807892517478400                     [UNK]
682808408857272320    [socialmedia, twitter]
682809868449796099                     [UNK]
682815454159761408    [socialmedia, twitter]
682816642242490369          [news, fakenews]
Name: CATS, dtype: object

In [23]:
df_tweet_cat_counts.reset_index().dtypes

TID      int64
CATS    object
dtype: object

In [24]:
df_merged_meta_cats = df_merged_meta.merge(
    df_tweet_cat_counts.reset_index(), how="left", left_on="t_id", right_on="TID")

In [25]:
df_merged_meta_cats.columns

Index([          u'Author',       u'City/Urban',  u'City/Urban Area',
               u'Contents',             u'Date',        u'Followers',
              u'Following',             u'GUID',           u'Gender',
                   u'Name',            u'Posts',              u'RT?',
           u'State/Region',              u'URL',             u'URL?',
              u'adjective',           u'adverb',      u'count_tweet',
               u'hashtag?',         u'mention?',         u'negation',
                   u'noun',      u'preposition',    u'processedPost',
              u'sentiment',   u'sentiment_subj', u'subjectvity_type',
                   u't_id',       u'topic_name',             u'verb',
              u't_created',       u't_retweets',      u't_favorites',
             u't_is_reply',       u't_is_quote',     u't_n_hashtags',
               u't_n_urls',     u't_n_mentions',        u't_n_media',
                   u'u_id',        u'u_created',       u'u_n_listed',
          u'u_n_favo

## Add location states

```
df_merged_meta_cats.u_location.value_counts().to_csv("USER_LOCATIONS.txt", sep="\t", encoding='utf-8')
! head USER_LOCATIONS.txt
! python process_user_locations.py ## RUN using python3 from command line
```

In [26]:
df_places = pd.read_csv("PARSED_STATES.final.txt", sep="\t")
df_places = df_places.rename(columns={
        "location": "u_location", "parse_manual": "u_state"
                                     })[["u_location", "u_state"]]
df_places.head()

Unnamed: 0,u_location,u_state
0,United States,USA
1,USA,USA
2,"Washington, DC",DC
3,"New York, NY",NY
4,"Los Angeles, CA",CA


In [27]:
df_merged_meta_cats = df_merged_meta_cats.merge(df_places, how="left", on="u_location")

  rlab = rizer.factorize(rk)


In [28]:
df_merged_meta_cats.u_state.head()

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
Name: u_state, dtype: object

In [29]:
df_merged_meta_cats.to_hdf("FINAL_ANALYSIS_DATA.h5", "final_data")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->axis0] [items->None]

  f(store)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_items] [items->None]

  f(store)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->['Author', 'City/Urban', 'City/Urban Area', 'Contents', 'Date', 'Gender', 'Name', 'RT?', 'State/Region', 'URL', 'URL?', 'hashtag?', 'mention?', 'processedPost', 'sentiment', 'sentiment_subj', 'subjectvity_type', 'topic_name', u't_created', u't_is_reply', u't_is_quote', u'u_created', u'u_is_verified', u'u_location', u'u_name', u'u_url', 'CATS', 'u_state']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
your performance may suffer as PyTables will pickle object types that it cannot
map directly 