In [8]:
from tqdm.notebook import tqdm
import numpy as np
import re
import pandas as pd

# Data analysis (biography)

In [9]:
df = pd.read_parquet('data/text_data.parquet.gzip')

In [10]:
df['split_text'] = df['text'].apply(lambda x: re.split(' |\.|,|;|!|\?|\"|/|\(|\)|\[|]|&|:|-|\' |=', x.lower()))

In [11]:
df_word_counts = df['split_text'].explode().value_counts()
relevant_words = set(df_word_counts.loc[df_word_counts >= 50].index)
relevant_words

{'',
 '2',
 'a',
 'about',
 'all',
 'also',
 'always',
 'am',
 'amp',
 'an',
 'and',
 'any',
 'anything',
 'are',
 'area',
 'around',
 'as',
 'ask',
 'at',
 'back',
 'be',
 'beach',
 'because',
 'been',
 'being',
 'but',
 'by',
 'can',
 'chat',
 'click',
 'college',
 'cool',
 'currently',
 'do',
 "don't",
 'dont',
 'down',
 'easy',
 'else',
 'enjoy',
 'family',
 'few',
 'find',
 'for',
 'friend',
 'friends',
 'from',
 'fun',
 'funny',
 'games',
 'get',
 'girl',
 'girls',
 'go',
 'going',
 'good',
 'great',
 'guy',
 'guys',
 'hang',
 'have',
 'having',
 'hello',
 'here',
 'hey',
 'hi',
 'hit',
 'honest',
 'hot',
 'how',
 'i',
 "i'm",
 'if',
 'im',
 'in',
 'interested',
 'into',
 'is',
 'it',
 'just',
 'keep',
 'kids',
 'know',
 'laid',
 'laugh',
 'life',
 'like',
 'live',
 'living',
 'lol',
 'look',
 'looking',
 'love',
 'make',
 'man',
 'married',
 'maybe',
 'me',
 'meet',
 'meeting',
 'message',
 'more',
 'most',
 'movies',
 'much',
 'music',
 'my',
 'myself',
 'name',
 'never',
 'new

In [12]:
word_ratings_temp = {}
def word_to_rating(row):
    for word in row['split_text']:
        if word in word_ratings_temp:
            word_ratings_temp[word].append(row['avg_rating'])
        else:
            word_ratings_temp[word] = [row['avg_rating']]

In [13]:
test = df.apply(lambda x: word_to_rating(x), axis=1)

In [14]:
word_ratings = {'word': [], 'avg_rating': []}
for word, ratings in word_ratings_temp.items():
    if word in relevant_words:
        word_ratings['word'].append(word)
        word_ratings['avg_rating'].append(np.mean(ratings))
df_word_ratings = pd.DataFrame(word_ratings)