# MEATY Cleaner

In [10]:
from MEATY_backend import *
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import patches as mpatches
import datetime as dt
%matplotlib inline
import queue
import threading
from pytesseract import image_to_string
from PIL import Image
from itertools import combinations as combos
import imagehash as ihash

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))


def get_date(x): return x.date()
def get_dayofweek(x): return x.dayofweek

In [19]:
def hash_img(post_id=None, group_folder=None):
    def make_fn(x): return group_folder + '/imgs/' + x + '.jpg'
    img_fn = make_fn(post_id)
    img = Image.open(img_fn)
    return (post_id, str(ihash.whash(img, hash_size=32)))

In [12]:
def hash_raw_data(group_name='cornell', save_csv=True):
    group_folder = '../' + group_name
    backward_scrape_exists = os.path.exists(
        group_folder + '/temp_memedata_backward.csv')
    forward_scrape_exists = os.path.exists(
        group_folder + '/temp_memedata_forward.csv')
    if backward_scrape_exists:
        dfb = pd.read_csv(group_folder + '/temp_memedata_backward.csv')
    if forward_scrape_exists:
        dff = pd.read_csv(group_folder + '/temp_memedata_forward.csv')
    if forward_scrape_exists and backward_scrape_exists:
        print('--> Found both backward and forward files.')
        df = pd.concat([dfb, dff])
    elif forward_scrape_exists:
        print('--> Found only a forward file.')
        df = dff
    elif backward_scrape_exists:
        print('--> Found only a backward file.')
        df = dfb
    else:
        print('ERROR: Cannot find files!')
        return
    list_ids = df['id'].values
    imhash = [(post_id, str(ihash.whash(Image.open(group_folder + '/imgs/' + post_id + '.jpg'),
                                        hash_size=32)))
              for post_id in list_ids]
    df['post_time'] = pd.DatetimeIndex(df['post_time'])
    df['post_date'] = df['post_time'].apply(get_date)
    df['post_hour'] = df['post_time'].apply(lambda x: x.hour)
    df = dedup_raw_hashed(df)
    df.sort_values(by='post_time', inplace=True)
    hash_df = pd.DataFrame(imhash, columns=['id', 'img_hash'])
    df_hashed = pd.merge(df, hash_df)
    if save_csv:
        df_hashed.to_csv(
            group_folder + '/raw_memedata_hashed.csv', index=False)
    return df_hashed

In [13]:
def load_dedupe_memedata(group_name=None):
    if group_name is None:
        try:
            raw_data_path = 'all_memedata_dedupe.csv'
        except:
            print('ERROR: could not load file')
    else:
        raw_data_path = group_name + '/raw_memedata_hashed.csv'
    raw_data_path = "../" + raw_data_path
    df = pd.read_csv(raw_data_path)
    df['post_time'] = pd.DatetimeIndex(df['post_time'])
    df = dedupe_raw_hashed(df)
    if group_name:
        df['group'] = group_name
    else:
        df['group'] = df.group.str.lstrip('../')
    return df

In [14]:
def dedupe_combine_groups(list_of_groups=['cornell', 'harvard', 'yale', 'princeton',
                                          'columbia', 'dartmouth', 'penn', 'brown'],
                          save_csv=True):

    df_list = threads(8, list_of_groups, hash_raw_data)
    df_all = pd.concat(df_list)
    df_all.set_index('id', inplace=True)
    if save_csv:
        df_all.to_csv('../all_memedata_dedupe.csv')
    return df_all

In [20]:
def aggregate_member_data(list_of_groups=['cornell', 'harvard', 'yale', 'princeton',
                                          'columbia', 'dartmouth', 'penn', 'brown'],
                          save_csv=True):
    def strp_date(x): return datetime.datetime.strptime(x, '%B %d, %Y').date()
    df_list = []
    for group_name in list_of_groups:
        member_data_path = '../' + group_name + '/memberdata.csv'
        df = pd.read_csv(member_data_path, parse_dates=['date'])
        df['group'] = group_name
        df.sort_values('date', inplace=True)
        df_list.append(df)
    df_all = pd.concat(df_list)
    df_all['date'].loc[df_all['date'].str.contains(
        'about', na=False)] = '2017-10-30'
    df_all['date'].loc[df_all['date'] == 'February 4, 2017'] = '2017-02-04'
    df_all['date'] = pd.DatetimeIndex(df_all['date'])
    if save_csv:
        print('saved')
        df_all.to_csv('../all_member_data.csv')
    return df_all

In [21]:
def dedupe_raw_hashed(df):
    df_dedupe = df.drop_duplicates(
        ['img_hash', 'post_time', 'poster_name', 'url'], keep='first')
    return df_dedupe

In [17]:
def reposted_imgs(df):
    rep_im_df = df[df.img_hash.isin
                   (df.img_hash.value_counts(ascending=False)[df.img_hash.value_counts(ascending=False).values >= 4].index
                    )]
    return rep_im_df

In [18]:
def group_size_over_time(df_all=None, list_of_groups=['cornell', 'harvard', 'yale', 'princeton',
                                                      'columbia', 'dartmouth', 'penn', 'brown']):
    if df_all is None:
        df_all = aggregate_member_data()
    count_idx = pd.date_range(df_all['date'].min(), df_all['date'].max())
    dic_size = {}
    for g in list_of_groups:
        gdf = df_all.groupby(['date', 'group']).group.count().groupby('group').cumsum()[:, g].reindex(count_idx,
                                                                                                      method='ffill')
        dic_size[g] = gdf
    size_df = pd.DataFrame(dic_size)
#     size_df[size_df.isna()] = 0
    return size_df

In [85]:
def stackplot_size(count_df, list_of_groups=['harvard', 'columbia', 'yale', 'cornell', 'penn',
                                             'princeton', 'dartmouth', 'brown']):
    stack_list = []
    patch_list = []
    color_ord = []
    for school in list_of_groups:
        print(count_df[school].index.min())
        stack_list.append(count_df[school].values)
        patch_list.append(mpatches.Patch(color=color_dic[school]))
        color_ord.append(color_dic[school])
    fig, ax = plt.subplots()
    ax.stackplot(count_df.index.to_pydatetime(), stack_list, colors=color_ord)
    ax.legend(patch_list, list_of_groups, loc='upper left')
    ax.set_xlim(left=dt.date(2017, 11, 15),
                right=dt.date(2016, 11, 15))
    fig.set_size_inches((20, 12))
    fig.set_dpi(200)
    fig.set_facecolor('lightgray')

In [86]:
#       Requires CSV files locally -> should adapt to postgres 
meme_df = load_dedupe_memedata()
df = aggregate_member_data()

size_df = group_size_over_time()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


saved
saved


In [None]:
# df_list = threads(8, ['cornell','harvard', 'yale', 'princeton', 'columbia', 'dartmouth', 'penn', 'brown'],
#                   hash_raw_data)
# for group_name in 
#     hash_raw_data(group_name)

In [None]:
dupim = reposted_imgs(df_al)
di_counts = dupim.img_hash.value_counts(ascending=False)

In [None]:
dupim.url.values

In [None]:
di_counts

In [None]:
for ih, n in zip(di_counts.index, di_counts.values):
    print(ih[:16], '  ---  ' + str(n))

In [None]:

dfl_d = 
dfl_d = [dedup_raw_hashed(d) for d in df_list]

In [None]:
df_old = pd.read_csv('raw_postdata.csv')

In [None]:
df_old['df'] = 'old'

In [None]:
df_old.drop(['img_text', 'post_dayofweek'], axis=1, inplace=True)

In [None]:
df_old['post_time'] = pd.DatetimeIndex(df_old['post_time'])

In [None]:
dfl = pd.read_csv('../columbia/raw_memedata_hashed-32.csv')
dfl['post_time'] = pd.DatetimeIndex(dfl['post_time'])
dfl['df'] = 'nu'
dfl.drop(['reacts_url', 'thankfuls', 'prides'], axis=1, inplace=True)

In [None]:
df = pd.concat([dfl, df_old])

In [None]:
dfdup = df[df[['img_hash', 'post_time', 'poster_name']].duplicated(keep=False)].sort_values(by='post_time')

In [None]:
dfd = df.drop_duplicates(['img_hash', 'post_time', 'poster_name', 'url'], keep='first')

In [None]:
dfd.shape

In [None]:
odfd = dfd[dfd['df'] == 'old']
# odfd[odfd[['num_reacts', 'post_time', 'poster_name']].duplicated()].img_hash

In [None]:
odfd.shape

In [None]:
# odfd.post_date.value_counts(dropna=False).groupby("post_date")

In [None]:
# post_df[post_df[['post_time', 'poster_name', 'title']].duplicated(keep=False)]