In [1]:
from joblib import load
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
import streamlit as st
import _pickle as pickle
from random import sample
from PIL import Image
from scipy.stats import halfnorm


In [3]:
# Load the three pickled artifacts this notebook works on into module-level
# names (`df`, `clustered_df`, `vect_df`) that later cells read.
# Forward slashes are used because they resolve on Windows as well as on
# macOS/Linux, whereas '..\\data\\...' only works on Windows.
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file -- only load artifacts that come from a trusted source.
with open('../data/refined_profiles.pkl', 'rb') as fp:
    df = pickle.load(fp)
with open('../data/refined_cluster.pkl', 'rb') as fp:
    clustered_df = pickle.load(fp)
with open('../data/vectorized_refined.pkl', 'rb') as fp:
    vect_df = pickle.load(fp)

In [4]:
# Deserialize the persisted model with joblib. The path uses forward slashes
# so it resolves on Windows, macOS and Linux alike (the backslash form only
# works on Windows).
# NOTE(security): joblib.load relies on pickle; only load trusted files.
model = load('../data/refined_model.joblib')

In [5]:
def to_string(preference):
    """Collapse a list of preference strings into one space-separated string.

    Anything that is not a ``list`` instance is handed back untouched.
    """
    return ' '.join(preference) if isinstance(preference, list) else preference

In [6]:
def vectorize(df, columns):
    """Encode the leading preference columns of ``df`` as numeric features.

    Columns are consumed from the front of the frame. While the first column
    is one of the known preference columns it is encoded, the encoding is
    appended to the end of the frame and the original column is dropped.
    Processing stops at the first leading column that is not a known
    preference column.

    * ``Religion`` / ``Politics`` are replaced by their category codes in a
      lower-cased column (``.cat`` is used, so they must be categorical).
    * ``Bios``, ``Movies``, ``Music``, ``Social Media`` and ``Sports`` are
      replaced by ``CountVectorizer`` token counts, one column per token.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified; a new frame is returned.
    columns : sequence of column labels
        Only ``columns[0]`` is inspected for the first step; afterwards the
        working frame's own columns are used.

    Returns
    -------
    pandas.DataFrame
        The encoded frame (``df`` itself when there was nothing to encode).
    """
    categorical_columns = ('Religion', 'Politics')
    text_columns = ('Bios', 'Movies', 'Music', 'Social Media', 'Sports')

    # An empty column list means there is nothing left to encode.
    while len(columns) > 0:
        column_name = columns[0]

        if column_name in categorical_columns:
            # assign() builds a new frame, so the caller's frame is untouched.
            encoded = df[column_name].cat.codes
            df = df.assign(**{column_name.lower(): encoded}).drop(column_name, axis=1)
        elif column_name in text_columns:
            vectorizer = CountVectorizer()
            vec = vectorizer.fit_transform(df[column_name])
            # Reuse df's index: concat(axis=1) aligns on index labels, and a
            # fresh RangeIndex would yield NaN-padded, misaligned rows.
            df_words = pd.DataFrame(vec.toarray(),
                                    columns=vectorizer.get_feature_names_out(),
                                    index=df.index)
            df = pd.concat([df, df_words], axis=1).drop(column_name, axis=1)
        else:
            # first column is not a preference column: done
            break

        columns = df.columns

    return df

In [7]:
def scaling(df, input_df):
    """Min-max scale ``input_df`` using the ranges learned from ``df``.

    A ``MinMaxScaler`` is fitted on ``df`` and then applied to ``input_df``.
    The scaled values are returned as a DataFrame that keeps ``input_df``'s
    index and column labels.
    """
    fitted_scaler = MinMaxScaler().fit(df)
    scaled_values = fitted_scaler.transform(input_df)
    return pd.DataFrame(scaled_values,
                        index=input_df.index,
                        columns=input_df.columns)

In [None]:
def top_ten(cluster, vect_df, input_vect):
    """Return the (up to) ten profiles most correlated with the new user.

    Parameters
    ----------
    cluster : sequence
        ``cluster[0]`` is the cluster label to filter ``vect_df`` on.
    vect_df : pandas.DataFrame
        Feature frame that contains a ``'Cluster #'`` column.
    input_vect : pandas.DataFrame
        Feature row(s) of the new user; the first index label identifies
        the user.

    Returns
    -------
    pandas.DataFrame
        Rows of the module-level ``df`` for the most similar users, ordered
        by decreasing correlation. Every column after the first is cast to
        ``int`` and the whole frame is returned with ``object`` dtype.
    """
    # filtering clustered data and appending new profile
    des_cluster = vect_df[vect_df['Cluster #'] == cluster[0]].drop(columns=['Cluster #'])
    des_cluster = pd.concat([des_cluster, input_vect], ignore_index=False, sort=False)

    # Correlate every user (columns of the transposed frame) with the new
    # user's feature row. `.loc` selects that row by label.
    user_n = input_vect.index[0]
    corr = des_cluster.T.corrwith(des_cluster.loc[user_n])

    # Drop the user by label rather than by position: with tied correlations
    # the user is not guaranteed to be sorted first.
    top_10_sim = corr.drop(user_n).sort_values(ascending=False).iloc[:10]

    # copy() so the casts below never write into (a view of) the global df
    top_10 = df.loc[top_10_sim.index].copy()
    other_columns = top_10.columns[1:]
    top_10[other_columns] = top_10[other_columns].astype(int)
    return top_10.astype('object')
    

In [None]:
def example_bios():
    """Show three randomly chosen bios from ``df`` between two divider lines.

    Output goes to the Streamlit page; nothing is returned.
    """
    divider = '-' * 100

    st.write(divider)
    st.text('Example Bios')
    bios = df['Bios']
    # pick three distinct index labels at random and print their bios
    for label in sample(list(df.index), 3):
        st.text(bios.loc[label])
    st.write(divider)

In [None]:
def load_categories():
    """Load the pickled category and probability objects from ``../data``.

    Returns
    -------
    tuple
        ``(combined, p)``: the objects unpickled from ``categories.pkl``
        and ``probability.pkl`` respectively.
    """
    # Forward slashes resolve on Windows as well as macOS/Linux, whereas the
    # previous '..\\data\\...' form only worked on Windows.
    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # only load artifacts from a trusted source.
    with open('../data/categories.pkl', 'rb') as f:
        combined = pickle.load(f)
    with open('../data/probability.pkl', 'rb') as f:
        p = pickle.load(f)
    return combined, p

In [None]:
def frontend():
    st.title('Vibe Check')
    
    st.header('Find Peeps with simillar Vibes')
    st.write("using Machine Learning")
    image = Image.open('..\\data\\vibecheck.jpeg')
    st.image(image, use_column_width=True)
    
    new_profile = pd.DataFrame(columns=df.columns, index=[df.index[-1] + 1])
    new_profile['Bios'] = st.text_input("Enter a Bio for yourself")