### Raw Tweets Size Partition Worksheet

In [12]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import os
from tqdm import tqdm_notebook

In [2]:
# Load the complete tweet DataFrame from its pickle snapshot.
# NOTE(review): unpickling can execute arbitrary code — only run this against
# a 'tweet_df.pkl' that comes from a trusted source.
with open('tweet_df.pkl', 'rb') as pkl_stream:
    df_tweets = pickle.load(pkl_stream)
# Preview the first few rows.
df_tweets.head()

Unnamed: 0,id,date,text,author,favorite,url
0,340609093125423104,2013-05-31 16:21:50,Paper wallets are the best way to store # bitc...,aantonop,0,https://twitter.com/aantonop/status/3406090931...
1,340578313456218112,2013-05-31 14:19:32,@ LetsTalkBitcoin Thank you. So new to twitter...,aantonop,0,https://twitter.com/aantonop/status/3405783134...
2,340552331110273024,2013-05-31 12:36:17,@ edla @ BLOUIN_NEWS @ swardley @ Archimedius ...,aantonop,0,https://twitter.com/aantonop/status/3405523311...
3,340551805593329667,2013-05-31 12:34:12,@ edla @ BLOUIN_NEWS @ swardley @ Archimedius ...,aantonop,0,https://twitter.com/aantonop/status/3405518055...
4,340541494295404545,2013-05-31 11:53:13,@ BLOUIN_NEWS @ edla @ swardley @ Archimedius ...,aantonop,0,https://twitter.com/aantonop/status/3405414942...


In [3]:
# Summarise the loaded frame: row count, index range, column dtypes and memory use.
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2563803 entries, 0 to 70
Data columns (total 6 columns):
id          object
date        datetime64[ns]
text        object
author      object
favorite    int64
url         object
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 136.9+ MB


In [4]:
# Put the tweets in chronological order, then renumber the rows from 0 and
# discard the previous index labels.
df_tweets = df_tweets.sort_values(by='date').reset_index(drop=True)
df_tweets.head()

Unnamed: 0,id,date,text,author,favorite,url
0,1089186195,2008-12-31 17:48:08,I just finished doing Wii Fit Yoga and really ...,billbarhydt,0,https://twitter.com/billbarhydt/status/1089186195
1,1089432998,2008-12-31 20:52:39,Champagne tasting on the roof,jack,0,https://twitter.com/jack/status/1089432998
2,1089472226,2008-12-31 21:11:51,happy new years nycers.,TheStalwart,0,https://twitter.com/TheStalwart/status/1089472226
3,1089486321,2008-12-31 21:20:26,Happy New Year east coast peeps!,billbarhydt,0,https://twitter.com/billbarhydt/status/1089486321
4,1089486628,2008-12-31 21:20:39,Happy new year! Prime rib was an utter success!,garrytan,0,https://twitter.com/garrytan/status/1089486628


In [5]:
# clear any duplicate rows
df_tweets = df_tweets.drop_duplicates()
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2563803 entries, 0 to 2563802
Data columns (total 6 columns):
id          object
date        datetime64[ns]
text        object
author      object
favorite    int64
url         object
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 136.9+ MB


In [6]:
# clean up column names
df_t2 = df_tweets.rename(index=str, columns={"id": "tweet_id", "author": "screen_name"}).copy()
df_t2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2563803 entries, 0 to 2563802
Data columns (total 6 columns):
tweet_id       object
date           datetime64[ns]
text           object
screen_name    object
favorite       int64
url            object
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 136.9+ MB


In [18]:
# add year and month fields for storage
df_t2['year'] = df_t2['date'].dt.year
df_t2['month'] = df_t2['date'].dt.month
df_t2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2563803 entries, 0 to 2563802
Data columns (total 8 columns):
tweet_id       object
date           datetime64[ns]
text           object
screen_name    object
favorite       int64
url            object
year           int64
month          int64
dtypes: datetime64[ns](1), int64(3), object(4)
memory usage: 256.0+ MB


In [19]:
# partition data by year and month and store
root = 'raw_data'
for y in tqdm_notebook(range(2008,2020)):
    for m in range(1,13):
        
        df_slice = df_t2[(df_t2['year'] == y)&(df_t2['month'] == m)].copy()
        df_slice = df_slice.drop(['year', 'month'], axis=1)
        
        filename = 'raw_tweets_{}_{}.pkl'.format(y,m)
        filestring = os.path.join(root, filename)
        with open(filestring, 'wb') as filehandle:  
            # store the data
            pickle.dump(df_slice, filehandle)
        

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




In [20]:
# reload dataframe as data quality check
df_reload = pd.DataFrame()

root = 'raw_data'
for y in tqdm_notebook(range(2008,2020)):
    for m in range(1,13):
        
        filename = 'raw_tweets_{}_{}.pkl'.format(y,m)
        filestring = os.path.join(root, filename)
        
        with open(filestring, 'rb') as filehandle:  
            # read the data as binary data stream
            df_slice = pickle.load(filehandle)
            
        df_reload = pd.concat([df_reload, df_slice], ignore_index=True)

df_reload.info()

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2563803 entries, 0 to 2563802
Data columns (total 6 columns):
tweet_id       object
date           datetime64[ns]
text           object
screen_name    object
favorite       int64
url            object
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 117.4+ MB


In [21]:
# Spot-check ten random rows of the reloaded data. The fixed seed keeps the
# sample identical across Restart & Run All.
df_reload.sample(10, random_state=42)

Unnamed: 0,tweet_id,date,text,screen_name,favorite,url
21557,1985026127,2009-05-31 16:16:47,@ mmilian ha!,jack,1,https://twitter.com/jack/status/1985026127
1297812,813631496863019008,2016-12-26 22:24:00,I've been feeling quite sad about the crash of...,el33th4xor,14,https://twitter.com/el33th4xor/status/81363149...
511069,385855053782740992,2013-10-03 12:53:08,@ BitcoinSteve @ shapeways I just wanted to re...,BitPay,0,https://twitter.com/BitPay/status/385855053782...
1013927,661305566967562240,2015-11-02 14:15:08,Are your readers having trouble understanding ...,Snowden,4428,https://twitter.com/Snowden/status/66130556696...
91481,15875634050,2010-06-10 13:16:46,Hey! Maria Bartiromo just asked Angelides abou...,TheStalwart,0,https://twitter.com/TheStalwart/status/1587563...
251892,159681513796018176,2012-01-18 09:00:14,Unified's social operating platform brings ent...,michaelterpin,0,https://twitter.com/michaelterpin/status/15968...
223925,131056744263712768,2011-10-31 10:15:37,@ javierota @ mmustapic alcanza con leer los a...,santisiri,0,https://twitter.com/santisiri/status/131056744...
2375146,1094983350396293120,2019-02-11 07:35:49,sorry then if i wasn’t clear. hoped the parent...,santisiri,1,https://twitter.com/santisiri/status/109498335...
147814,40894300644442112,2011-02-24 14:02:15,RT @ dashb0t: EXCLUSIVE: MUST CREDIT SPORTS PA...,TheStalwart,0,https://twitter.com/TheStalwart/status/4089430...
185480,92541648453640192,2011-07-17 03:30:22,Freedom of speech in the age of WikiLeaks http...,wikileaks,34,https://twitter.com/wikileaks/status/925416484...
