In [12]:
import pandas as pd
import _pickle as pickle
import numpy as np
from scipy.stats import halfnorm

In [19]:
with open(".\\data\\profiles.pkl",'rb') as fp:
    df = pickle.load(fp)
df.head()

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
0,Typical twitter fanatic. Infuriatingly humble ...,5,3,4,1,3,6,7
1,Web junkie. Analyst. Infuriatingly humble intr...,7,9,5,1,9,4,0
2,Avid web maven. Food practitioner. Gamer. Twit...,1,2,6,5,6,5,4
3,Twitteraholic. Extreme web fanatic. Food buff....,5,2,7,8,2,6,6
4,Bacon enthusiast. Falls down a lot. Freelance ...,6,6,6,4,3,6,3


In [20]:
# Probability dictionary
p = {}

# Movie Genres
movies = ['Adventure',
          'Action',
          'Drama',
          'Comedy',
          'Thriller',
          'Horror',
          'RomCom',
          'Musical',
          'Documentary']

p['Movies'] = [0.28,
               0.21,
               0.16,
               0.14,
               0.09,
               0.06,
               0.04,
               0.01, 
               0.01]

# Religions (could potentially create a spectrum)
religion = ['Catholic',
            'Christian',
            'Jewish',
            'Muslim',
            'Hindu',
            'Buddhist',
            'Spiritual',
            'Other',
            'Agnostic',
            'Atheist']

p['Religion'] = [0.16,
                 0.16,
                 0.01,
                 0.19,
                 0.11,
                 0.05,
                 0.10,
                 0.09,
                 0.07,
                 0.06]

# Music
music = ['Rock',
         'HipHop',
         'Pop',
         'Country',
         'Latin',
         'EDM',
         'Gospel',
         'Jazz',
         'Classical']

p['Music'] = [0.30,
              0.23,
              0.20,
              0.10,
              0.06,
              0.04,
              0.03,
              0.02,
              0.02]

# Sports
sports = ['Football',
          'Baseball',
          'Basketball',
          'Hockey',
          'Soccer',
          'Other']

p['Sports'] = [0.34,
               0.30,
               0.16, 
               0.13,
               0.04,
               0.03]

# Politics (could also put on a spectrum)
politics = ['Liberal',
            'Progressive',
            'Centrist',
            'Moderate',
            'Conservative']

p['Politics'] = [0.26,
                 0.11,
                 0.11,
                 0.15,
                 0.37]

# Social Media
social = ['Facebook',
          'Youtube',
          'Twitter',
          'Reddit',
          'Instagram',
          'Pinterest',
          'LinkedIn',
          'SnapChat',
          'TikTok']

p['Social Media'] = [0.36,
                     0.27,
                     0.11,
                     0.09,
                     0.05,
                     0.03,
                     0.03,
                     0.03,
                     0.03]


age = halfnorm.rvs(loc=18,scale=8, size=df.shape[0]).astype(int)


In [21]:

categories = [movies, religion, music, politics, social, sports, age]
names = ['Movies','Religion', 'Music', 'Politics', 'Social Media', 'Sports', 'Age']
combined = dict(zip(names, categories))


In [31]:
with open(".\\data\\refined_profiles.pkl",'rb') as fp:
    refined_df = pickle.load(fp)
refined_df.head()

Unnamed: 0,Bios,Movies,Religion,Music,Politics,Social Media,Sports,Age
0,Typical twitter fanatic. Infuriatingly humble ...,"[Thriller, Action]",Buddhist,"[Gospel, Pop, Rock]",Progressive,"[Twitter, Reddit, Instagram]","[Basketball, Football, Hockey]",19
1,Web junkie. Analyst. Infuriatingly humble intr...,"[Comedy, Drama]",Spiritual,"[HipHop, Pop]",Conservative,"[Twitter, Facebook]","[Basketball, Soccer, Hockey]",19
2,Avid web maven. Food practitioner. Gamer. Twit...,"[Horror, Comedy]",Christian,"[HipHop, Rock]",Moderate,"[Reddit, Youtube]","[Baseball, Football]",21
3,Twitteraholic. Extreme web fanatic. Food buff....,"[Adventure, Comedy]",Agnostic,[Rock],Moderate,"[Facebook, Youtube]","[Basketball, Football, Hockey]",25
4,Bacon enthusiast. Falls down a lot. Freelance ...,"[Action, Adventure, Drama]",Hindu,"[Latin, Rock, HipHop]",Conservative,"[Twitter, TikTok, Youtube]","[Hockey, Football, Basketball]",19


In [43]:
list(df.columns)[1:]

['Movies', 'Religion', 'Music', 'Politics', 'Social Media', 'Sports', 'Age']

In [44]:
list(combined.keys())

['Movies', 'Religion', 'Music', 'Politics', 'Social Media', 'Sports', 'Age']

In [51]:
def get_input(bio='',movies_list='', religion='', music_list='', politics='', social_media_list='', sports_list='', age = 0):
    return (bio, (movies_list, religion, music_list, politics, social_media_list, sports_list, age))
bio, categories = get_input()
len(categories)

7

In [54]:
def get_profile():
    bio, categories = get_input()
    profile = pd.DataFrame(columns=df.columns, index=[df.index[-1] + 1])
    profile['Bios'] = bio
    
    for i, name in enumerate(list(combined.keys())):
        profile[name] = categories[i]
    return profile

profile = get_profile()
profile.columns

Index(['Bios', 'Movies', 'Religion', 'Music', 'Politics', 'Social Media',
       'Sports', 'Age'],
      dtype='object')

In [None]:

with open('..\\data\\combined.pkl', 'wb') as f:
    pickle.dump(combined, f)
with open('..\\data\\probability.pkl', 'wb') as f:
    pickle.dump(p, f)


In [8]:
combined

{'Movies': ['Adventure',
  'Action',
  'Drama',
  'Comedy',
  'Thriller',
  'Horror',
  'RomCom',
  'Musical',
  'Documentary'],
 'Religion': ['Catholic',
  'Christian',
  'Jewish',
  'Muslim',
  'Hindu',
  'Buddhist',
  'Spiritual',
  'Other',
  'Agnostic',
  'Atheist'],
 'Music': ['Rock',
  'HipHop',
  'Pop',
  'Country',
  'Latin',
  'EDM',
  'Gospel',
  'Jazz',
  'Classical'],
 'Politics': ['Liberal',
  'Progressive',
  'Centrist',
  'Moderate',
  'Conservative'],
 'Social Media': ['Facebook',
  'Youtube',
  'Twitter',
  'Reddit',
  'Instagram',
  'Pinterest',
  'LinkedIn',
  'SnapChat',
  'TikTok'],
 'Sports': ['Football', 'Baseball', 'Basketball', 'Hockey', 'Soccer', 'Other'],
 'Age': array([24, 26, 19, ..., 28, 20, 18], shape=(6600,))}

In [None]:
# religion, politics have only one value
# age is decided from half normal distribution
# rest can have max 3 choices
for name, category in combined.items():
    if name in ['Religion', 'Politics']:
        df[name] = np.random.choice(category, df.shape[0], p=p[name])  
    elif name == 'Age':
        df[name] = category
    else:
        try:
            # randomly choose strings from category, with size df rows, 1, 3, and probability p
            df[name] = list(np.random.choice(category, size=(df.shape[0],1,3), p=p[name]))
        except:
            df[name] = list(np.random.choice(category, size=(df.shape[0],1,3)))
        
        df[name] = df[name].apply(lambda x: list(set(x[0].tolist())))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = list(np.random.choice(category, size=(df.shape[0],1,3), p=p[name]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = df[name].apply(lambda x: list(set(x[0].tolist())))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = np.random.choice(category, df.shape[0], p=p[name])
A 

In [30]:
df

Unnamed: 0,Bios,Movies,Religion,Music,Politics,Social Media,Sports,Age
0,Typical twitter fanatic. Infuriatingly humble ...,"[Drama, Adventure, Action]",Other,"[Pop, EDM]",Progressive,"[Pinterest, Facebook, Instagram]","[Football, Other, Hockey]",23
1,Web junkie. Analyst. Infuriatingly humble intr...,"[Adventure, Thriller]",Hindu,"[Pop, Rock]",Progressive,[Facebook],[Baseball],18
2,Avid web maven. Food practitioner. Gamer. Twit...,"[Drama, Adventure, Action]",Muslim,"[Country, Rock]",Conservative,"[Twitter, Youtube, Facebook]","[Basketball, Baseball, Hockey]",29
3,Twitteraholic. Extreme web fanatic. Food buff....,"[Adventure, Action]",Catholic,"[Country, HipHop]",Moderate,"[Twitter, Youtube]","[Football, Baseball, Basketball]",25
4,Bacon enthusiast. Falls down a lot. Freelance ...,"[Adventure, Action, Comedy]",Hindu,"[Pop, Rock]",Conservative,"[Youtube, Facebook]","[Football, Baseball, Hockey]",19
...,...,...,...,...,...,...,...,...
6595,Typical pop culture nerd. Infuriatingly humble...,"[Drama, Adventure]",Catholic,"[HipHop, Pop, Classical]",Liberal,"[Youtube, Facebook]","[Football, Baseball, Basketball]",24
6596,Avid web junkie. Lifelong alcohol guru. Hardco...,"[Drama, Adventure, Musical]",Spiritual,"[Country, HipHop, Rock]",Conservative,"[Youtube, SnapChat]","[Basketball, Hockey]",28
6597,Music ninja. Bacon fanatic. Reader. Total comm...,"[Drama, Action, Comedy]",Atheist,"[Country, Pop]",Conservative,"[Reddit, Facebook, TikTok]",[Football],19
6598,Communicator. Bacon lover. Award-winning intro...,"[Horror, Comedy]",Other,[HipHop],Conservative,[Facebook],"[Football, Soccer, Basketball]",18


In [31]:
df['Religion'] = pd.Categorical(df.Religion, ordered=True,
                                categories=['Catholic',
                                            'Christian',
                                            'Jewish',
                                            'Muslim',
                                            'Hindu',
                                            'Buddhist',
                                            'Spiritual',
                                            'Other',
                                            'Agnostic',
                                            'Atheist'])

df['Politics'] = pd.Categorical(df.Politics, ordered=True,
                                categories=['Liberal',
                                            'Progressive',
                                            'Centrist',
                                            'Moderate',
                                            'Conservative'])

In [32]:
df

Unnamed: 0,Bios,Movies,Religion,Music,Politics,Social Media,Sports,Age
0,Typical twitter fanatic. Infuriatingly humble ...,"[Drama, Adventure, Action]",Other,"[Pop, EDM]",Progressive,"[Pinterest, Facebook, Instagram]","[Football, Other, Hockey]",23
1,Web junkie. Analyst. Infuriatingly humble intr...,"[Adventure, Thriller]",Hindu,"[Pop, Rock]",Progressive,[Facebook],[Baseball],18
2,Avid web maven. Food practitioner. Gamer. Twit...,"[Drama, Adventure, Action]",Muslim,"[Country, Rock]",Conservative,"[Twitter, Youtube, Facebook]","[Basketball, Baseball, Hockey]",29
3,Twitteraholic. Extreme web fanatic. Food buff....,"[Adventure, Action]",Catholic,"[Country, HipHop]",Moderate,"[Twitter, Youtube]","[Football, Baseball, Basketball]",25
4,Bacon enthusiast. Falls down a lot. Freelance ...,"[Adventure, Action, Comedy]",Hindu,"[Pop, Rock]",Conservative,"[Youtube, Facebook]","[Football, Baseball, Hockey]",19
...,...,...,...,...,...,...,...,...
6595,Typical pop culture nerd. Infuriatingly humble...,"[Drama, Adventure]",Catholic,"[HipHop, Pop, Classical]",Liberal,"[Youtube, Facebook]","[Football, Baseball, Basketball]",24
6596,Avid web junkie. Lifelong alcohol guru. Hardco...,"[Drama, Adventure, Musical]",Spiritual,"[Country, HipHop, Rock]",Conservative,"[Youtube, SnapChat]","[Basketball, Hockey]",28
6597,Music ninja. Bacon fanatic. Reader. Total comm...,"[Drama, Action, Comedy]",Atheist,"[Country, Pop]",Conservative,"[Reddit, Facebook, TikTok]",[Football],19
6598,Communicator. Bacon lover. Award-winning intro...,"[Horror, Comedy]",Other,[HipHop],Conservative,[Facebook],"[Football, Soccer, Basketball]",18


In [33]:
with open("refined_profiles.pkl",'wb') as fp:
    pickle.dump(df, fp)

In [43]:
with open("refined_profiles.pkl",'rb') as fp:
    df = pickle.load(fp)
df

Unnamed: 0,Bios,Movies,Religion,Music,Politics,Social Media,Sports,Age
0,Typical twitter fanatic. Infuriatingly humble ...,"[Drama, Adventure, Action]",Other,"[Pop, EDM]",Progressive,"[Pinterest, Facebook, Instagram]","[Football, Other, Hockey]",23
1,Web junkie. Analyst. Infuriatingly humble intr...,"[Adventure, Thriller]",Hindu,"[Pop, Rock]",Progressive,[Facebook],[Baseball],18
2,Avid web maven. Food practitioner. Gamer. Twit...,"[Drama, Adventure, Action]",Muslim,"[Country, Rock]",Conservative,"[Twitter, Youtube, Facebook]","[Basketball, Baseball, Hockey]",29
3,Twitteraholic. Extreme web fanatic. Food buff....,"[Adventure, Action]",Catholic,"[Country, HipHop]",Moderate,"[Twitter, Youtube]","[Football, Baseball, Basketball]",25
4,Bacon enthusiast. Falls down a lot. Freelance ...,"[Adventure, Action, Comedy]",Hindu,"[Pop, Rock]",Conservative,"[Youtube, Facebook]","[Football, Baseball, Hockey]",19
...,...,...,...,...,...,...,...,...
6595,Typical pop culture nerd. Infuriatingly humble...,"[Drama, Adventure]",Catholic,"[HipHop, Pop, Classical]",Liberal,"[Youtube, Facebook]","[Football, Baseball, Basketball]",24
6596,Avid web junkie. Lifelong alcohol guru. Hardco...,"[Drama, Adventure, Musical]",Spiritual,"[Country, HipHop, Rock]",Conservative,"[Youtube, SnapChat]","[Basketball, Hockey]",28
6597,Music ninja. Bacon fanatic. Reader. Total comm...,"[Drama, Action, Comedy]",Atheist,"[Country, Pop]",Conservative,"[Reddit, Facebook, TikTok]",[Football],19
6598,Communicator. Bacon lover. Award-winning intro...,"[Horror, Comedy]",Other,[HipHop],Conservative,[Facebook],"[Football, Soccer, Basketball]",18


In [44]:
df.to_csv("refined_profiles.csv", index=False)


In [45]:
df = pd.read_csv('refined_profiles.csv')
df

Unnamed: 0,Bios,Movies,Religion,Music,Politics,Social Media,Sports,Age
0,Typical twitter fanatic. Infuriatingly humble ...,"['Drama', 'Adventure', 'Action']",Other,"['Pop', 'EDM']",Progressive,"['Pinterest', 'Facebook', 'Instagram']","['Football', 'Other', 'Hockey']",23
1,Web junkie. Analyst. Infuriatingly humble intr...,"['Adventure', 'Thriller']",Hindu,"['Pop', 'Rock']",Progressive,['Facebook'],['Baseball'],18
2,Avid web maven. Food practitioner. Gamer. Twit...,"['Drama', 'Adventure', 'Action']",Muslim,"['Country', 'Rock']",Conservative,"['Twitter', 'Youtube', 'Facebook']","['Basketball', 'Baseball', 'Hockey']",29
3,Twitteraholic. Extreme web fanatic. Food buff....,"['Adventure', 'Action']",Catholic,"['Country', 'HipHop']",Moderate,"['Twitter', 'Youtube']","['Football', 'Baseball', 'Basketball']",25
4,Bacon enthusiast. Falls down a lot. Freelance ...,"['Adventure', 'Action', 'Comedy']",Hindu,"['Pop', 'Rock']",Conservative,"['Youtube', 'Facebook']","['Football', 'Baseball', 'Hockey']",19
...,...,...,...,...,...,...,...,...
6595,Typical pop culture nerd. Infuriatingly humble...,"['Drama', 'Adventure']",Catholic,"['HipHop', 'Pop', 'Classical']",Liberal,"['Youtube', 'Facebook']","['Football', 'Baseball', 'Basketball']",24
6596,Avid web junkie. Lifelong alcohol guru. Hardco...,"['Drama', 'Adventure', 'Musical']",Spiritual,"['Country', 'HipHop', 'Rock']",Conservative,"['Youtube', 'SnapChat']","['Basketball', 'Hockey']",28
6597,Music ninja. Bacon fanatic. Reader. Total comm...,"['Drama', 'Action', 'Comedy']",Atheist,"['Country', 'Pop']",Conservative,"['Reddit', 'Facebook', 'TikTok']",['Football'],19
6598,Communicator. Bacon lover. Award-winning intro...,"['Horror', 'Comedy']",Other,['HipHop'],Conservative,['Facebook'],"['Football', 'Soccer', 'Basketball']",18
