In [1]:
import operator, re
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from neurotools.io.files import load_embedding_model
from neurotools.io.gdrive import download_sheets_doc
from neurotools.plot import figure
from sklearn.feature_extraction.text import TfidfVectorizer
from neurotools.language.simple_tokenizer import SimpleTokenizer
from wordcloud import WordCloud
from googletrans import Translator
import itertools
import ast
%matplotlib inline

In [2]:
tokenizer = SimpleTokenizer('de') 

Extra characters read for de
Stopwords set read for de
Lemma lookup read for de


In [3]:
emb, word2rank_dict = load_embedding_model('../utils/002_de_de_commoncrawl_v1_20171209.bin')

In [4]:
def get_average_vector(words, emb):
    """
        Average the word vectors of a list of words.
    """
    words_in_emb = [emb[word] for word in words if word in emb.vocab.keys()]
    return np.mean(words_in_emb, axis=0)

In [5]:
def get_idf_map(documents):
    tfidf = TfidfVectorizer(use_idf=True)
    tfidf.fit_transform(documents)
    word_to_idf = dict((x[0], tfidf.idf_[x[1]]) for x in sorted(tfidf.vocabulary_.items(), key=operator.itemgetter(1)))
    return word_to_idf

In [6]:
def get_idf_weighted_average_vector(words, word_to_idf, emb):
    """
        Average the idf-weighted word vectors of a list of words. (Map this function to every document)
    """
    # only consider words IN embedding model and in TFIDF vectorizer
    words = [word for word in words if word in word_to_idf.keys() and word in emb.vocab.keys()]
    weighted = [word_to_idf[word] * emb[word] for word in words]
    return np.mean(weighted, axis=0)

In [7]:
def cosine_similarity(a,b):
    """
        Compute the cosine similarity of a and b.
    """
    return np.dot(a,b) / ( (np.dot(a,a) **.5) * (np.dot(b,b) ** .5) )

In [8]:
def get_similarity_with_dimensions(row, dimensions):
    """
        Get similarity of row's word vector and every dimension.
        row : row of the movie dataframe
        dimensions : dataframe containing Neuroflash dimensions
    """
    for k in range(dimensions['dimension'].shape[0]):
        name = str(dimensions['label'].iloc[k])
        # define a new column for the cosine similarity of dimension k and the post text
        row[name] = cosine_similarity(row['wv'], dimensions['dimension'].iloc[k])
    return row

In [9]:
dimensions = pd.read_csv('../data/all_dimensions.csv')
dimensions.dimension = dimensions.dimension.map(lambda x: np.array(ast.literal_eval(x)))

In [10]:
# because some display as 1.4K
def convert_K_to_num(x):
    if 'K' in str(x):
        num = float(str(x[:-1])) * 1000
    elif pd.isnull(x):
        num = 0
    else:
        num = float(x)
    return num

In [16]:
insta1 = pd.read_csv('../data/Insta_Ben_Jerrys/Instagram Data Link2 part 1.csv')
insta2 = pd.read_csv('../data/Insta_Ben_Jerrys/Instagram Data Link2 part 2.csv')
insta3 = pd.read_csv('../data/Insta_Ben_Jerrys/Instagram Data Link2 part 3.csv')
insta4 = pd.read_csv('../data/Insta_Ben_Jerrys/Instagram Data Link2 part 4.csv')
benandjerrys = pd.concat([insta1, insta2, insta3, insta4], axis=0, ignore_index=True)

insta1 = pd.read_csv('../data/Insta_MissesVlog//Instagram Data Link3 part 1.csv')
insta2 = pd.read_csv('../data/Insta_MissesVlog/Instagram Data Link3 part 2.csv')
insta3 = pd.read_csv('../data/Insta_MissesVlog/Instagram Data Link3 part 3.csv')
insta4 = pd.read_csv('../data/Insta_MissesVlog/Instagram Data Link3 part 4.csv')
insta5 = pd.read_csv('../data/Insta_MissesVlog/Instagram Data Link3 part 5.csv')
insta6 = pd.read_csv('../data/Insta_MissesVlog/Instagram Data Link3 part 6.csv')
insta7 = pd.read_csv('../data/Insta_MissesVlog/Instagram Data Link3 part 7.csv')
missesvlog = pd.concat([insta1, insta2, insta3, insta4, insta5, insta6, insta7], axis=0, ignore_index=True)

insta1 = pd.read_csv('../data/Insta_Aminatabelli/Instagram Data Link3 part one.csv')
insta2 = pd.read_csv('../data/Insta_Aminatabelli/Instagram Data Link3 part two.csv')
insta3 = pd.read_csv('../data/Insta_Aminatabelli/Instagram Data Link3 part three.csv')
aminatabelli = pd.concat([insta1, insta2, insta3], axis=0, ignore_index=True)

insta1 = pd.read_csv('../data/Insta_Jung_Naiv/Instagram Data Link 1 part one.csv')
insta2 = pd.read_csv('../data/Insta_Jung_Naiv/Instagram Data Link 1 part two.csv')
jung_naiv = pd.concat([insta1, insta2], axis=0, ignore_index=True)

greenpeace = pd.read_csv('../data/Insta_Greenpeace.csv')
lemonaid = pd.read_csv('../data/Insta_Lemonaid.csv')
victorisvanviolence = pd.read_csv('../data/Insta_VictorisVanViolence.csv')

In [17]:
dfs = [benandjerrys, missesvlog, aminatabelli, jung_naiv, greenpeace, lemonaid, victorisvanviolence]
names = ['IG_BenandJerrys.csv', 'IG_MissesVlog.csv', 'IG_AminataBelli.csv', 'IG_jungundnaiv.csv', 'IG_Greenpeace.csv', 'IG_Lemonaid.csv', 'IG_Victoria.van.Violence.csv']

In [18]:
iterate = list(zip(dfs, names))

In [38]:
def preprocess_insta_data(insta, name):
    num_of_comments = insta.groupby('Post ID')['Post Comment'].count() - 1
    num_of_comments.name = 'comments'
    insta_df = insta.groupby('Post ID').head(1).set_index('Post ID')
    insta_df['likes'] = insta_df['Post likes'].str.replace(',', '').str.extract('([0-9]+)').astype(int)
    insta_df = insta_df.join(num_of_comments, how='left')
    insta_df = insta_df.reset_index()
    insta_df['success'] = insta_df['likes'] + 2*insta_df['comments']
    insta_df['tokens'] = insta_df['Post Comment'].astype(str).map(tokenizer.tokenize)
    insta_df = insta_df[['Post Comment','tokens', 'success', 'likes', 'comments', 'Post Time']]
    insta_df.columns = ['message', 'tokens', 'success', 'likes', 'comments', 'created_time']
    insta_word_to_idf = get_idf_map(insta_df['tokens'].map(lambda x: ' '.join(x)).values)
    insta_df['wv'] = insta_df['tokens'].map(lambda x: get_idf_weighted_average_vector(x, insta_word_to_idf, emb))
    insta_df = insta_df[~insta_df['wv'].isnull()]
    insta_df = insta_df.apply(lambda x: get_similarity_with_dimensions(x, dimensions), axis=1)
    insta_df['wv'] = insta_df['wv'].map(lambda x: str(list(x)))
    insta_df.to_csv('../processed/processed_' + name, index=False)

In [39]:
for df, name in iterate[3:]:
    print("Doing {}".format(name))
    preprocess_insta_data(df, name)

Doing IG_jungundnaiv.csv


  """
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Doing IG_Greenpeace.csv
Doing IG_Lemonaid.csv
Doing IG_Victoria.van.Violence.csv
