# Install packages and import files

In [1]:
import nltk, re, pprint
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import pandas as pd
import numpy as np



from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

import seaborn as sns
import matplotlib as plt

In [2]:
# Load the profile spreadsheet; the path is relative, so it is resolved
# against the notebook's current working directory.
profile=pd.read_excel('profiles_csv_updated_20191011.xlsx')

In [3]:
# Summarise the loaded frame: column names, dtypes and non-null counts.
profile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 7 columns):
Name                                     118 non-null object
Gender                                   118 non-null object
All But Last Paragraph (professional)    118 non-null object
 Last Paragraph (personal)               117 non-null object
Cohort                                   118 non-null object
International                            117 non-null float64
full                                     118 non-null object
dtypes: float64(1), object(6)
memory usage: 6.5+ KB


In [3]:
# Use each person's name as the row index.
# Guarded so that re-running this cell is a no-op instead of raising a
# KeyError once 'Name' has already been moved out of the columns.
if 'Name' in profile.columns:
    profile = profile.set_index('Name')


In [25]:
#a=profile.index.unique()

# First round of text cleaning

In [6]:
import re
import string

In [51]:
# Module-level Porter stemmer instance, built from the class imported in the
# notebook's import cell.
porter = PorterStemmer()

def clean_text_round1(text):
    """First-pass normalisation of one profile text.

    Steps, in order:
      1. Replace every tab and newline with a single space.
      2. Delete every ASCII punctuation character (``string.punctuation``).
      3. Lower-case the result.

    The earlier version ended with ``porter.stem(text)`` applied to the whole
    document. A Porter stemmer works on one word at a time; handed a full
    document it merely lower-cases it and then rewrites the suffix of the
    final word. Lower-casing is now done explicitly and the last word is left
    intact.

    Parameters
    ----------
    text : str
        Raw profile text.

    Returns
    -------
    str
        Lower-cased text without ASCII punctuation, tabs or newlines. Runs of
        spaces are not collapsed.
    """
    # Remove line/column breakers first so words on either side stay separated.
    text = re.sub(r'[\t\n]', ' ', text)
    # Punctuation is deleted (not replaced by a space), so "data-driven"
    # becomes "datadriven".
    # NOTE(review): string.punctuation is ASCII-only; typographic quotes such
    # as ’ pass through unchanged.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text.lower()


In [52]:
def round1(x):
    """Run the first cleaning pass on a single text value (for use with ``Series.apply``)."""
    return clean_text_round1(x)

In [53]:
# Clean every profile's full text and keep the result as a one-column frame
# (the column keeps the name 'full').
cleaned_full = profile['full'].apply(round1)
data_clean = cleaned_full.to_frame()

In [54]:
# Spot-check the cleaned text for the last 50 profiles.
data_clean.full.tail(50)

Name
Travis Noftle             having almost a decade of experience in invest...
Urbain Nounagnon          turning        challenges        into        o...
Krishna Patel             as an avid  butterfly enthusiast  throughout  ...
Payal Patel               payal is an ambitious adventurer who enjoys ex...
Samir Patel               one of  the most important  ties samir has  to...
Urmila Patil              urmila’s  best  qualities are analytical  thin...
Jackson Perry             team  is  everything  to  jackson  over  the  ...
Rachel Powell             after  many  rounds  of  experiments  rachel  ...
Alexander Preiss          sandy  thrives  at  the  intersection  of  dat...
Sunny\tQin                as  a  product  of  entrepreneurial  parents  ...
Lahari Revuri             lahari  explores  the  world  through  the  le...
Jonathan Rice             studying  human  population  genetics  for  th...
Benjamin Roberts          growing        up        in        a        sm...
Basil R

# Document-Term Matrix

In [55]:
# Build the document-term matrix: one row per person, one column per
# vocabulary term, cells hold raw term counts (English stop words excluded).
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.full)

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

data_dtm = pd.DataFrame(
    data_cv.toarray(),
    columns=feature_names,
    index=data_clean.index,  # rows are labelled with the people's names
)
data_dtm

Unnamed: 0_level_0,10,100,10000,1012,10k,11,1200,13,140page,15,...,youth,youtube,zach,zeal,zealous,zion,zip,ziplining,zone,zs
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Shiraz Ahmed,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Emily Ammons,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
David Andexler,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ryan Ankersen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Rivers Baker,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Robert Bayer,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alison Berger,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
Marshall Bradley,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Price Burnett,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Jackson Cabell,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
# (number of people, vocabulary size)
data_dtm.shape

(118, 5084)

In [57]:
# Flip the matrix so that terms are rows and people are columns.
data = data_dtm.T
data.head()

Name,Shiraz Ahmed,Emily Ammons,David Andexler,Ryan Ankersen,Rivers Baker,Robert Bayer,Alison Berger,Marshall Bradley,Price Burnett,Jackson Cabell,...,Faith Turner,Travis M. Walker,Meghan Weber,Ryan Weisner,Samantha Widman,Sarah Wotus,Lu Wu,Chesaney Wyse,Sharon Yao,Michael Zabawa
10,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10k,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Find the top 10 words for each person

In [58]:
def _top_terms(counts, n=10):
    """Return the ``n`` highest-count terms of one person as (term, count) pairs."""
    ranked = counts.sort_values(ascending=False).head(n)
    return list(zip(ranked.index, ranked.values))


# Map every person (a column of `data`) to their ten most frequent terms.
top_dict = {person: _top_terms(data[person]) for person in data.columns}

top_dict

{'Shiraz Ahmed': [('data', 6),
  ('shiraz', 4),
  ('foreign', 2),
  ('work', 2),
  ('middle', 2),
  ('analytics', 2),
  ('skin', 2),
  ('discrimination', 2),
  ('conclusions', 2),
  ('nations', 2)],
 'Emily Ammons': [('emily', 8),
  ('research', 3),
  ('community', 3),
  ('native', 3),
  ('study', 3),
  ('grass', 3),
  ('beta', 3),
  ('communication', 2),
  ('spatial', 2),
  ('biology', 2)],
 'David Andexler': [('david', 5),
  ('time', 4),
  ('student', 3),
  ('organizations', 2),
  ('marketing', 2),
  ('position', 2),
  ('strategies', 2),
  ('business', 2),
  ('understanding', 2),
  ('government', 2)],
 'Ryan Ankersen': [('ryan', 6),
  ('new', 4),
  ('challenges', 3),
  ('members', 2),
  ('learn', 2),
  ('event', 2),
  ('college', 2),
  ('team', 2),
  ('father', 2),
  ('accepted', 1)],
 'Rivers Baker': [('rivers', 6),
  ('business', 5),
  ('analytics', 4),
  ('data', 4),
  ('development', 3),
  ('collaboration', 2),
  ('biotechnology', 2),
  ('team', 2),
  ('plan', 2),
  ('solutions',

In [69]:

# Flatten each person's top-10 (word, count) pairs into a single
# comma-separated string of words, one record per person.
# The slice used to be `top_words[0:9]`, which silently dropped the tenth
# word; `[:10]` keeps all ten.
top = [
    {'name': name, 'words': ', '.join(word for word, _count in top_words[:10])}
    for name, top_words in top_dict.items()
]


In [70]:
# Turn the per-person records into a two-column table.
top = pd.DataFrame(top, columns=['name', 'words'])

In [71]:
# Preview the first few rows of the name / top-words table.
top.head()

Unnamed: 0,name,words
0,Shiraz Ahmed,"data, shiraz, foreign, work, middle, analytics..."
1,Emily Ammons,"emily, research, community, native, study, gra..."
2,David Andexler,"david, time, student, organizations, marketing..."
3,Ryan Ankersen,"ryan, new, challenges, members, learn, event, ..."
4,Rivers Baker,"rivers, business, analytics, data, development..."


In [76]:
# Show the top-words row for one specific person.
is_lu_wu = top['name'] == 'Lu Wu'
display(top.loc[is_lu_wu])

Unnamed: 0,name,words
114,Lu Wu,"lu, student, communication, program, group, te..."


## Identify the non-zero items in the document-term matrix

In [60]:
# Count, per person, how many vocabulary terms occur at least once, i.e. the
# number of non-zero cells in that person's column of `data`.
# `Series.nonzero()` was deprecated in pandas 0.24 and removed in 1.0, so the
# count is taken from a boolean mask instead. The resulting list follows the
# order of `data.columns`.
unique_list = (data != 0).sum(axis=0).tolist()



  after removing the cwd from sys.path.


In [65]:
#unique_list

In [26]:
# The only other assignment of `a` in this notebook is commented out, so it is
# defined here to keep the cell runnable from a fresh kernel.
a = profile.index.unique()
a

Index(['Shiraz Ahmed', 'Emily Ammons', 'David Andexler', 'Ryan Ankersen',
       'Rivers Baker', 'Robert Bayer', 'Alison Berger', 'Marshall Bradley',
       'Price Burnett', 'Jackson Cabell',
       ...
       'Faith Turner', 'Travis M. Walker', 'Meghan Weber', 'Ryan Weisner',
       'Samantha  Widman', 'Sarah Wotus ', 'Lu Wu', 'Chesaney Wyse ',
       'Sharon Yao', 'Michael Zabawa'],
      dtype='object', name='Name', length=118)

In [62]:
# Pair each person with their unique-word count and rank from most to fewest.
# Names are taken from `data.columns` -- the same sequence `unique_list` was
# computed over -- so names and counts are aligned by construction and the
# cell does not rely on a separately defined name list.
data_words = pd.DataFrame({'name': list(data.columns), 'unique_words': unique_list})
data_unique_sort = data_words.sort_values(by='unique_words', ascending=False)
#data_unique_sort

In [64]:
# Persist the ranking next to the notebook; with pandas' defaults the frame's
# index is written as the first (unnamed) CSV column.
data_unique_sort.to_csv(r'uniquewords_2.csv')

# Lookup: profiles with International == 0 and Gender == 'M'


In [6]:
# Subset of profiles flagged International == 0 whose Gender is 'M'.
is_not_international = profile['International'].eq(0)
is_male = profile['Gender'].eq('M')
profile_n = profile[is_not_international & is_male]

In [8]:
# List the names (index labels) that survived the filter.
profile_n.index

Index(['Shiraz Ahmed', 'David Andexler', 'Ryan Ankersen', 'Robert Bayer',
       'Marshall Bradley', 'Price Burnett', 'Jackson Cabell',
       'Patrick Campbell', 'Dominick Carbone', 'Kyle Clapper', 'Grant Clark',
       'Jess Conner', 'Walter Creech', 'Emmanuel Daramola', 'Trevor Edge',
       'William Elmore', 'Maxwell Fairbairn', 'Andrew Francis',
       'Christopher Goodrich', 'Thomas Gow', 'Hirsh Gupta', 'Lance Hudson',
       'Nathan Jones', 'Shawn Kim', 'Grant King', 'Beruk Kiros', 'Taylor Kooy',
       'Samuel LaFell', 'Zach Lewis', 'Kevin Lybrand', 'Brett Lytle',
       'Preston Daniel MacDonald', 'Michael Marchetta', 'McKinnon Martin',
       'Kevin McDowell', 'Will Misenheimer', 'Travis Noftle',
       'Urbain Nounagnon', 'Samir Patel', 'Jackson Perry', 'Alexander Preiss',
       'Jonathan Rice', 'Benjamin Roberts', 'Basil Rodts',
       'Burton Scott Rudolph\n', 'Charlie Ruff\n', 'Sameen Salam',
       'Noah Woessner Sayre', 'Sam Scarpino', 'Austin Seals', 'Sufyan Shahin',


In [9]:
# Hand-picked names to look up, one per line.
# NOTE(review): the index printed above shows 'Burton Scott Rudolph\n' and
# 'Charlie Ruff\n' with a trailing newline, so these two entries only match
# when whitespace is normalised before comparing.
ss = [
    'Ryan Ankersen',
    'Robert Bayer',
    'Price Burnett',
    'Jackson Cabell',
    'Patrick Campbell',
    'Kyle Clapper',
    'Grant Clark',
    'Jess Conner',
    'Walter Creech',
    'Trevor Edge',
    'William Elmore',
    'Maxwell Fairbairn',
    'Thomas Gow',
    'Lance Hudson',
    'Zach Lewis',
    'Kevin Lybrand',
    'Brett Lytle',
    'Preston Daniel MacDonald',
    'Michael Marchetta',
    'McKinnon Martin',
    'Kevin McDowell',
    'Will Misenheimer',
    'Travis Noftle',
    'Jackson Perry',
    'Benjamin Roberts',
    'Basil Rodts',
    'Burton Scott Rudolph',
    'Charlie Ruff',
    'Sam Scarpino',
]

In [16]:
# Keep only the hand-picked people from `profile_n`.
# Some index labels carry stray whitespace (e.g. 'Charlie Ruff\n'), which an
# exact `isin` silently fails to match, so names are whitespace-normalised on
# both sides before comparing.
# `ss` is rebound here from the list of names to the filtered frame; on a
# re-run the wanted names are therefore recovered from that frame's index.
_wanted_raw = ss.index if isinstance(ss, pd.DataFrame) else ss
_wanted = {' '.join(str(label).split()) for label in _wanted_raw}
_index_clean = profile_n.index.map(lambda label: ' '.join(str(label).split()))
ss = profile_n[_index_clean.isin(_wanted)]

In [19]:
# Preview the first rows of the filtered subset.
profile_n.head()

Unnamed: 0_level_0,Gender,All But Last Paragraph (professional),Last Paragraph (personal),Cohort,International,full
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Shiraz Ahmed,M,"Known at home for asking many questions, Shira...","When not programming or analyzing data, Shiraz...",o,0.0,"Known at home for asking many questions, Shira..."
David Andexler,M,"From an early age, David capitalized on his in...",David has a passion for analysis and for gener...,o,0.0,"From an early age, David capitalized on his in..."
Ryan Ankersen,M,Ryan is the perfect combination of his highly ...,"In his leisure time, Ryan enjoys any hobby tha...",o,0.0,Ryan is the perfect combination of his highly ...
Robert Bayer,M,Robert’s work ethic and efficient time managem...,"When Robert is not working, he tries to take a...",o,0.0,Robert’s work ethic and efficient time managem...
Marshall Bradley,M,"From the time he was young, Marshall has held ...","Professionally, Marshall enjoys learning about...",o,0.0,"From the time he was young, Marshall has held ..."


In [18]:
# Full profile text for the hand-picked people.
ss['full']

Name
Ryan Ankersen               Ryan is the perfect combination of his highly ...
Robert Bayer                Robert’s work ethic and efficient time managem...
Price Burnett               Price has always been a meticulous planner, ta...
Jackson Cabell              The youngest of three, Jackson has always been...
Patrick Campbell            Leveraging over seven years of experience in t...
Kyle Clapper                Kyle discovered his eye for process improvemen...
Grant Clark                 Growing up, Grant was always fascinated with s...
Jess Conner                 Jess is an avid learner who enjoys engaging wi...
Walter Creech               Throughout all the seasons, Walter was constan...
Trevor Edge                 Through his love for sports, specifically base...
William Elmore              William’s love for solving complex problems ha...
Maxwell Fairbairn           While training as a jazz pianist during his un...
Thomas Gow                  Strategy and innovation became 