# Descriptive model using hours user wants to work versus the observed distribution among similar users

This is based off code from the eda. The predictive model performs very poorly so I'm turning to something a little more hands on.

# Libraries and Data Import

In [1]:
# Data Analysis and Modeling
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

# Packages for PostgreSQL Import
import psycopg2
import os

# Packages for Extracting Embedding
import pickle  # Importing serialized model for prediction
import gensim
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords  # Cleaning text data

In [25]:
# Data Import
dbname = "freelance_db"
username = os.environ['USER']
pswd = os.environ['SQLPSWD']

# Connect to SQL Database
con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

# Importing Data
sql_query = """SELECT * FROM analysis_table_hours_worked;"""
dt = pd.read_sql_query(sql_query, con)

sql_query = """SELECT * FROM embeddings_table;"""
embeddings = pd.read_sql_query(sql_query, con)

# Merging Embeddings
embeddings = embeddings.drop(['index'], axis = 1)
dt = dt.drop(['index'], axis = 1)

dt = dt.merge(embeddings, how = "left", 
                                left_on = "profile_url", right_on= "profile_url")
dt = dt.fillna(0)

# Cleaning avg_word_length and bio_word count
dt.loc[dt['bio_length'] == 0, ['bio_word_count','avg_word_length']] = 0
dt = dt[dt.has_quote == 1]

dt.to_csv(os.environ['PWD'] + '/data/cleaned/users.csv',index=False)

# Functions

In [3]:
# Convert Bio to Embedding
def average_word_vectors(words, model, vocabulary, num_features):

    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.

    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])

    if nwords:
        feature_vector = np.divide(feature_vector, nwords)

    return feature_vector


def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]

    return np.mean(pd.DataFrame(np.array(features)), axis=0)

def get_word_embedding():
    # Loading Model
    try:
        filename = '/home/ubuntu/MyRate/scripts/model_w2v.sav'
        model_w2v = pickle.load(open(filename, 'rb'))
    except:
        filename = os.environ['PWD'] + '/scripts/models/model_w2v.sav'
        model_w2v = pickle.load(open(filename, 'rb'))

    tokenized_corpus = word_tokenize(bio)
    embeddings = averaged_word_vectorizer(corpus=tokenized_corpus, model=model_w2v,
                                          num_features=50)

    embeddings.index = [str(x) for x in list(embeddings.index)]
    embeddings = pd.DataFrame(embeddings).iloc[:, 0].to_dict()

    return embeddings

bio = "We are a group specializing in the creation of custom IT solutions for web, mobile, and desktop. We offer visually engaging and user-centric interactive solutions tailored to your business needs. We unify software development with digital marketing to transform the traditional into industry-changing solutions. We combine creative digital marketing with tailor-made software development and unique web design, to become the one-stop-shop for industry innovation. 1,000+ projects delivered 14+ years of experience transforming ideas into reality 280+ highly talented developers, designers, and marketers from around the world. Your digital originality. Crafted, coded and spread with passion. At Scopic our core services include:  Advanced Manufacturing Application Development FDA Compliant Scientific and Medical App Development Audio/Video App Development Financial App Development Communication and Collaboration App Development Crypto Currencies and Blockchain Development Services Machine Learning Solutions Conversational AI and Intelligent Assistance E-commerce Software Development SaaS Development"
test_embed = get_word_embedding()

  # Remove the CWD from sys.path while we load stuff.


In [4]:
# Creating Lookup Function

def get_my_rate(hours, skill_category, embedding):
    
    # Finding users within skill_category with similar embedding
    dt_search = dt[(dt.hours_worked_pr_mnth_pst_yr > (hours-5)) & (dt[skill_category] == 1) & (
        dt.hours_worked_pr_mnth_pst_yr < (hours+5))]
    
    new_embed_array = np.fromiter(test_embed.values(), dtype=float)
    data_embed_array = np.array(dt_search.iloc[:,list(range(18,68))])
    cos_sim = data_embed_array.dot(new_embed_array) / (np.linalg.norm(new_embed_array) * np.linalg.norm(data_embed_array, axis = 1))
    index_max = np.nanargmax(cos_sim)
    sim_user = dt_search.iloc[index_max,:].profile_url
    
    # Getting Rate Distribution
    rate_dist = dt[(dt.hours_worked_pr_mnth_pst_yr > (hours-5)) & (dt[skill_category] == 1) & (
        dt.hours_worked_pr_mnth_pst_yr < (hours+5))].hourly_rate.describe()
    
    med_rate = rate_dist[5]
    min_rate = rate_dist[3]
    max_rate = rate_dist[7]
    num_users = rate_dist[0]

    # Calculating Change for people outside of "success" window
    avg_decrease = (dt[(dt.hours_worked_pr_mnth_pst_yr < (hours-5)) & (
        dt[skill_category] == 1)].hourly_rate - med_rate).mean()

    avg_increase = (dt[(dt.hours_worked_pr_mnth_pst_yr > (hours+5)) & (
        dt[skill_category] == 1)].hourly_rate - med_rate).mean()
    
    # Printing Results
    print("We recommend an hourly rate of:", "$"+str(med_rate),"\n"
          "Of " + str(num_users) + " users - similar to you - working between " + str(hours-5) + " and " + str(hours+5) + " hours,\n" + 
          "The minimum hourly rate was", "$"+str(min_rate), "and the maximum was", "$"+str(max_rate)+".\n" +
          "URL of user similar to you based on your bio:",sim_user)
    
    return index_max

# Testing

In [5]:
bio = 'We are a group specializing in the creation of custom IT solutions for web, mobile, and desktop. We offer visually engaging and user-centric interactive solutions tailored to your business needs. We unify software development with digital marketing to transform the traditional into industry-changing solutions. We combine creative digital marketing with tailor-made software development and unique web design, to become the one-stop-shop for industry innovation. 1,000+ projects delivered 14+ years of experience transforming ideas into reality 280+ highly talented developers, designers, and marketers from around the world. Your digital originality. Crafted, coded and spread with passion. At Scopic our core services include:  Advanced Manufacturing Application Development FDA Compliant Scientific and Medical App Development Audio/Video App Development Financial App Development Communication and Collaboration App Development Crypto Currencies and Blockchain Development Services Machine Learning Solutions Conversational AI and Intelligent Assistance E-commerce Software Development SaaS DevelopmentOur highly skilled coders have experience with over 50 types of technologies including:•	Desktop: C++/Qt, .Net, Java, Ruby•	Web: React, Angular, Node.js, PHP/Laravel/Yii/Magento, .Net, Java, AWS Serverless Computing•	Mobile: React Native, Ionic, Xamarin•	Databases: SQL (MySQL, PostgreSQL, MS SQL and NoSQL),MongoDB, Dynamo, Firebase•	System and Software Architecture: Micro-services Architecture, Multi-tenant Architectures, Multi-tier Architecture•	DevOps: AWS, Google Cloud, CI/CD, Containers - Docker/Kubernetes•	Advanced Software Solutions: 3D/2D Graphics, Image Processing, AR/VR,Computer Vision, 3D Algorithmic (OpenCV, CUDA, Vuforia/Google VR/ARKit, Three.js, WebAssembly)Innovation starts with a powerful vision. Let’s work together to breathe life into your digital ideas.'
embed = get_word_embedding()
a = get_my_rate(5,'programming & development', test_embed)

We recommend an hourly rate of: $35.0 
Of 126.0 users - similar to you - working between 0 and 10 hours,
The minimum hourly rate was $5.0 and the maximum was $145.0.
URL of user similar to you based on your bio: https://www.guru.com/freelancers/calvin-tran-1


  # Remove the CWD from sys.path while we load stuff.


In [19]:
skill_categories = 'programming & development'
top_workers = dt[dt[skill_categories] ==
                     1]['hours_worked_pr_mnth_pst_yr'].nlargest(3).min()
top_workers

180.0

In [22]:
dt[(dt[skill_categories] == 1) &
        (dt['hours_worked_pr_mnth_pst_yr'] >= top_workers)].profile_url

11    https://www.guru.com/freelancers/eden-programm...
70       https://www.guru.com/freelancers/ken-b-1083572
81      https://www.guru.com/freelancers/kimberly-verdi
Name: profile_url, dtype: object