In [22]:
import pandas as pd
import numpy as np
from geotext import GeoText
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('tagsets')
nltk.download('wordnet')
import re
import string
pd.set_option('display.max_columns', 100)


stop_words = set(stopwords.words('english'))
stop_words = stop_words.union([str(x) for x in range(100)])
stop_words = stop_words.union(
    ["year", "old", ":", ")", "also", "-", "--", "n/a"])


def make_bag_of_words(df):
    df['bag_of_words'] = ''
    columns = df.columns
    for index, row in df.iterrows():
        words = ''
        for col in columns:
            if type(row[col]) != list:
                words = words + row[col] + ' '
            else:
                words = words + ' '.join(row[col]) + ' '
        row['bag_of_words'] = words

    df.drop(columns=[col for col in df.columns if col != 'bag_of_words'],
            inplace=True)
    
def get_preprocessed_text(text):
    words = word_tokenize(text)
    table = str.maketrans('', '', string.punctuation)
    words = [w.translate(table) for w in words]
    words = [word for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    snowball_stemmer = SnowballStemmer("english")
    lemmatizer = WordNetLemmatizer()
    words = [word for word in words if not word in stop_words]
    words = [snowball_stemmer.stem(word) for word in words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return words

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nthan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nthan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\nthan\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nthan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [27]:
mentor = pd.read_excel(
    'PayPal HackAthon - Mentor Application - Class 22 (Responses) .xlsx')

mentor_shortlisted = mentor.iloc[:, [11, 17, 18, 22, 23, 25, 27, 28, 29, 31]]
mentor_shortlisted.columns = [
    "gender", "title", "ethnic", "skills_hobbies", "loc_prefer", "group",
    "industry", "field_experience", "about_you", "fun_fact"
]
mentor_shortlisted = mentor.iloc[:, [11, 17, 22, 23, 25, 27, 28, 29, 31]]
mentor_shortlisted.columns = [
    "gender", "title", "skills_hobbies", "loc_prefer", "group", "industry",
    "field_experience", "about_you", "fun_fact"
]

mentor_shortlisted = mentor_shortlisted.fillna("")
mentor_shortlisted = mentor_shortlisted.replace("I prefer not to say", "")
mentor_shortlisted = mentor_shortlisted.replace("Other", "")

# for index, row in mentor_shortlisted.iterrows():
#     location = GeoText(row.loc["loc_prefer"])
#     row.loc["loc_prefer"] = [
#         x.replace(" ", "") for x in list(set(location.cities))
#     ]
#     row.loc["industry"] = re.split(",", row.loc["industry"])
#     row.loc["field_experience"] = re.split(",", row.loc["field_experience"])
#     row.loc["title"] = re.split(",", row.loc["title"])
#     fun = row["fun_fact"]
#     r = Rake(stopwords=stop_words)
#     r.extract_keywords_from_text(fun)
#     key_words_dict_scores = r.get_word_degrees()
#     row["fun_fact"] = list(key_words_dict_scores.keys())
#     about = row["about_you"]
#     r.extract_keywords_from_text(about)
#     key_words_dict_scores = r.get_word_degrees()
#     row["about_you"] = list(key_words_dict_scores.keys())
#     skill = row["skills_hobbies"]
#     r.extract_keywords_from_text(skill)
#     key_words_dict_scores = r.get_word_degrees()
#     row["skills_hobbies"] = list(key_words_dict_scores.keys())

In [29]:
mentor_shortlisted.to_csv("mentor.csv")

In [3]:
make_bag_of_words(mentor_shortlisted)

mentor_shortlisted["bag_of_words"] = mentor_shortlisted[
    "bag_of_words"].str.lower()
mentor = mentor_shortlisted

In [4]:
mentee = pd.read_excel(
    'PayPal HackAthon - YUBA SV Mentee Form Class 22 (Responses).xlsx')

mentee = mentee[[
    'In what city do you live?', 'Career field interest',
    'What industries interest you?',
    'What are you looking to gain from mentorship?',
    'What skills, interests, hobbies, or talents would you like to share with your mentor?',
    'What are the top 1-3 priorities you would like us to consider when matching you to a mentor?',
    'What would you want your mentor to know about you before you meet?'
]]

cities = mentee['In what city do you live?'].str.lower()
cities = [city.split(',')[0].replace(' ', '') for city in cities]

mentee['city'] = cities
mentee = mentee.drop(['In what city do you live?'], axis=1)

mentee = mentee.rename(
    columns={
        'Career field interest':
        'career_interest',
        'What industries interest you?':
        'industries',
        'What are you looking to gain from mentorship?':
        'gain',
        'What skills, interests, hobbies, or talents would you like to share with your mentor?':
        'skills_hobbies',
        'What would you want your mentor to know about you before you meet?':
        'about_you',
        'What are the top 1-3 priorities you would like us to consider when matching you to a mentor?':
        'priorities'
    })

mentee = mentee.replace(np.nan, '')

for col in mentee.drop(['city'], axis=1):
    k = []
    for row in mentee[col]:
        r = Rake(stopwords=stop_words)
        r.extract_keywords_from_text(row)

        key_words_dict_scores = r.get_word_degrees()

        k.append(list(key_words_dict_scores.keys()))
    mentee[col] = k
make_bag_of_words(mentee)
mentee["bag_of_words"] = mentee["bag_of_words"].str.lower()

In [5]:
for index, row in mentee.iterrows():
    row["bag_of_words"] = " ".join([word for word,pos in pos_tag(row["bag_of_words"].split()) if (pos == 'NN' or pos=='NNS')])
    row["bag_of_words"] = re.sub(r"[^\w ]"," ", row["bag_of_words"])
    row["bag_of_words"] = " ".join(get_preprocessed_text(row["bag_of_words"]))
for index, row in mentor.iterrows():
    row["bag_of_words"] = re.sub(r"[^\w ]"," ", row["bag_of_words"])
    row["bag_of_words"] = " ".join([word for word,pos in pos_tag(row["bag_of_words"].split()) if (pos == 'NN' or pos=='NNS')])
    row["bag_of_words"] = " ".join(get_preprocessed_text(row["bag_of_words"]))

In [6]:
count = TfidfVectorizer(strip_accents='unicode',
                        stop_words='english',
                        max_df=0.7,
                        sublinear_tf=True)
count.fit(pd.concat([mentor['bag_of_words'], mentee['bag_of_words']]))
mentor_cm = count.transform(mentor['bag_of_words'])
mentee_cm = count.transform(mentee['bag_of_words'])

In [13]:
count.idf_

array([5.12390336, 6.0401941 , 6.0401941 , ..., 6.0401941 , 6.0401941 ,
       5.63472899])

In [7]:
cosine_sim_mentor = cosine_similarity(X=mentor_cm, Y=mentee_cm)

recommended = []
id_mentee = 10

# creating a Series with the similarity scores in descending order
score_series = pd.Series(
    cosine_sim_mentor[:,id_mentee]).sort_values(ascending=False)

# getting the indexes of the 10 most similar movies
top_10_indexes = list(score_series.iloc[0:11].index)

In [8]:
score_series

6      0.162310
216    0.137749
165    0.135221
79     0.132697
233    0.129273
         ...   
63     0.000000
142    0.000000
65     0.000000
138    0.000000
120    0.000000
Length: 241, dtype: float64

In [9]:
top_10_indexes

[6, 216, 165, 79, 233, 201, 227, 238, 35, 13, 240]

In [10]:
mentor_ori = pd.read_excel(
    'PayPal HackAthon - Mentor Application - Class 22 (Responses) .xlsx')
mentee_ori = pd.read_excel(
    'PayPal HackAthon - YUBA SV Mentee Form Class 22 (Responses).xlsx')

In [11]:
mentee.iloc[[id_mentee]].values

array([['softwar develop web technolog music art advic gain guidanc willi step support someon mentorship see leader music interest interest food hobbi travel restaraunt talent share mentor skill food python plan time guidanc non grit experi work shi way use confid bit goal trust relationshiop techniqu time other brand lot anyth guidenc person someon understand someth learn moment conflict resolut way sanjos']],
      dtype=object)

In [21]:
mentee_ori.iloc[[10]]

Unnamed: 0,Timestamp,Full Name,What is your age?,Your Year Up email address?,What LC are you a member of?,Learning Track,Are you interested in joining the Mentor Program?,"If you answered NO, I am not interested in joining the Mentor Program, please briefly explain why?",Best phone number to reach you at:,In what city do you live?,Do you have access to a car?,Career field interest,What industries interest you?,Are you currently a parent or caretaker to someone?,What are you looking to gain from mentorship?,"Please list the neighborhoods, areas of the city, and suburbs that are convenient locations for you to meet with your mentor.","Are there any communities/groups that you are a part of or are passionate about that you want your mentor to know? (Ex: LGBQT, Religious/Faith, etc.)","What skills, interests, hobbies, or talents would you like to share with your mentor?",Do you foresee any challenges that would make it difficult to meet with your mentor?,What are the top 1-3 priorities you would like us to consider when matching you to a mentor?,What would you want your mentor to know about you before you meet?,Please share the link to your LinkedIn profile.
10,2019-07-14 14:32:51.702,Alicia Tran,23,atran000@sfo.yearup.org,LC Courage,Data,"Yes, sign me up!",,(510)-241-9275,San Jose,Yes,Software development and Web development,"Technology, Music, and Art",No,What I am learning to gain from mentorship is ...,Year Up Site in San Jose,"Outdoor Groups, Art Communities, Music Community","Some skills, interests, hobbies, or talents I ...",No,"Non-biased, open minded, and empathetic but at...",My learning techniques are different from othe...,https://www.linkedin.com/in/alicia-tran-645560...


In [12]:
mentor.iloc[top_10_indexes].values

array([['chief staff time lay term idea pictur hobbi educ skill talent sustain initi manag process develop cross convers question write listen project conflict resolut menlopark paloalto biotechnolog manufactur consult technolog servic technolog project manag data analyt time cat act'],
       ['compens manag board game confid cultur rescu puppi spca travel food spot redwoodc sancarlo servic profession consult technolog servic technolog project manag data analyt colleagu strive success friend thing listen someon defin mente other self mentor har confid excel consult dream job leader milk love anim impact organ psycholog day rescu dog food'],
       ['product strategi oper augment love chicago excit eye workforc provid experi product educ busi side opportun mente time avid basketbal player industri way problem share engin manag consult experi tech industri hip hop music fan technolog thing consum healthcar biotechnolog manufactur consult technolog servic technolog project manag data ana

In [16]:
mentee_ori.T.to_json()

'{"0":{"Timestamp":1562858363749,"Full Name":"Agustin Cortes","What is your age?":21,"Your Year Up email address?":"acortes@sfo.yearup.org","What LC are you a member of?":"LC Courage","Learning Track":"Project Management","Are you interested in joining the Mentor Program?":"Yes, sign me up!","If you answered NO, I am not interested in joining the Mentor Program, please briefly explain why? ":null,"Best phone number to reach you at:":8318017220,"In what city do you live?":"Hollister","Do you have access to a car?":"Yes","Career field interest":"Entrepreneurship + Tech","What industries interest you?":"Tech, neurology, psychology, marketing, E-commerce","Are you currently a parent or caretaker to someone?":"No","What are you looking to gain from mentorship?":"Extremely competent individual to share wisdom and keep me accountable","Please list the neighborhoods, areas of the city, and suburbs that are convenient locations for you to meet with your mentor.":"I can come to San Jose. Meeting

In [18]:
from pymongo import MongoClient
client = MongoClient()

In [19]:
client

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)