# Goals

1. Detect frequency of keyword appearances in bodies of text, looking at ngrams of lengths 1 and 2 in their respective questions. One set of checks will be for exact match, and the other will be for a stem-match (e.g., book <-> booking). 

2. Create keyword-keyword edgelist for each category for Gephi visualization.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk import ngrams
from nltk.stem.wordnet import WordNetLemmatizer
from pattern.en import conjugate

import re
from collections import Counter
from collections import defaultdict


In [2]:
#load data from "cleaning 00"
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 and parse_dates=True reproduces from_csv's
# defaults (first column as index, index parsed as dates where possible).
sorted_keyword_counts = pd.read_csv('sorted_keyword_counts.csv',
                                    index_col=0, parse_dates=True)
data = pd.read_csv('data_00.csv', index_col=0, parse_dates=True)

In [5]:
# Source file names; each one is used as a key of the per-category
# counters, and as a column name when the edge list is assembled.
categories = [
    'diy.csv',
    'cooking.csv',
    'biology.csv',
    'crypto.csv',
    'robotics.csv',
    'travel.csv',
]

#lemmatizer used for getting roots of verbs
lemmatizer = WordNetLemmatizer()
# sanity check that also documents a quirk of the lemmatizer: as a verb,
# 'hating' comes back as 'hat' (not 'hate'), so stem matches are approximate
assert lemmatizer.lemmatize('hating','v') == 'hat'

def lemmatize(x):
    '''Return a rough root form of the word ``x``.

    The word is first lemmatized as a verb, and the result is lemmatized
    again with no part-of-speech argument. This is a lazy way of getting
    word roots that should cover most verbs/nouns.
    '''
    return lemmatizer.lemmatize(lemmatizer.lemmatize(x, 'v'))

def lemmatize_n(x, split_char='-'):
    '''Map the lemmatized form of a multi-word keyword back to the keyword.

    ``x`` is split on ``split_char``, every piece is lemmatized, and the
    pieces are joined again with ``split_char``.

    Returns a one-entry dict ``{lemmatized_keyword: x}``.
    '''
    stemmed_parts = map(lemmatize, x.split(split_char))
    return {split_char.join(stemmed_parts): x}

def lemmatize_list(x, join_char='-'):
    '''Lemmatize every element of ``x`` and join the results with ``join_char``.'''
    return join_char.join(map(lemmatize, x))

# per category: keyword -> Counter of 'hit'/'miss', using exact matching
keyword_matching = {category: defaultdict(Counter) for category in categories}
# same layout, but keywords are matched on word stems for broader matching
keyword_matching_broad = {category: defaultdict(Counter) for category in categories}
# per category: counts of keyword pairs, keyed by frozenset
# (like a tuple, but unordered)
keyword_pairs = {category: Counter() for category in categories}

In [10]:
# characters that separate tokens in the question text
punctuation_regex = r'[ .,?;:*()![\]\-]+'
# compiled once, outside the row loop
token_splitter = re.compile(punctuation_regex)


def _tally_matches(matched_flags, tallies):
    '''Add one 'hit' or one 'miss' to ``tallies[keyword]`` per keyword.

    ``matched_flags`` maps keyword -> bool (True when the keyword was found
    in the text); ``tallies`` maps keyword -> Counter.
    '''
    for keyword, matched in matched_flags.items():
        tallies[keyword]['hit' if matched else 'miss'] += 1


for _, row in data.iterrows():
    category = row['source']
    keywords = row['tags'].split(' ')
    #separate keywords into their lengths; only keywords of one or two
    #words ('-' separates the words of a keyword) are matched against the text
    l1_keywords = [keyword for keyword in keywords if keyword.count('-') == 0]
    l2_keywords = [keyword for keyword in keywords if keyword.count('-') == 1]

    #tokenize once per row; 1-grams and 2-grams are both derived from it.
    #Matching only needs to know *whether* a gram occurs, so sets suffice.
    tokens = token_splitter.split(row['fulltext'])
    unigrams = set(tokens)
    bigrams = set(ngrams(tokens, 2))

    #keyword matching (exact)
    joined_bigrams = set('-'.join(gram) for gram in bigrams)
    exact_flags = {keyword: keyword in unigrams for keyword in l1_keywords}
    exact_flags.update(
        (keyword, keyword in joined_bigrams) for keyword in l2_keywords)
    _tally_matches(exact_flags, keyword_matching[category])

    #keyword stem matching: lemma -> original keyword
    #NOTE: two keywords of the same row that share a lemma collide in these
    #dicts; only the last one can be marked as a hit, the other is a miss.
    l1_stems = {lemmatize(keyword): keyword for keyword in l1_keywords}
    l2_stems = {}
    for keyword in l2_keywords:
        l2_stems.update(lemmatize_n(keyword))

    #each distinct token / bigram is lemmatized only once
    unigram_lemmas = set(lemmatize(token) for token in unigrams)
    bigram_lemmas = set(lemmatize_list(gram) for gram in bigrams)

    stem_flags = dict.fromkeys(l1_keywords + l2_keywords, False)
    for lemma, keyword in l1_stems.items():
        stem_flags[keyword] = lemma in unigram_lemmas
    for lemma, keyword in l2_stems.items():
        stem_flags[keyword] = lemma in bigram_lemmas
    _tally_matches(stem_flags, keyword_matching_broad[category])

    #keyword pairs: every unordered pair of tags on this question
    for ki, keyword_1 in enumerate(keywords):
        for keyword_2 in keywords[ki + 1:]:
            keyword_pairs[category][frozenset([keyword_1, keyword_2])] += 1
    

Now that all the information has been extracted, there are a few things left:

1. Create data frames for the keyword hits & misses, and derive a percentage from the two columns

2. Plot some of the data from (1) to give an idea of accuracies and frequencies

3. Create an edgelist from the dict of frozensets, as well as a nodelist containing the source. Plot in Gephi separately.

4. (Possibly later) join accuracies from the data frames in (1) to the nodelist in (3) for some more possibilities in Gephi.

In [23]:
def addcol(varname, value, x):
    '''Set ``x[varname] = value`` and return ``x``.

    ``x`` is modified in place; returning it lets the call be used inline
    inside an expression such as a list comprehension.
    '''
    x[varname] = value
    return x

def _matches_to_frame(matching):
    '''Stack per-category keyword tallies into one data frame.

    ``matching`` maps source -> {keyword: Counter}. The result is indexed by
    keyword, has one column per counter key, and a 'source' column naming
    the category each row came from.
    '''
    # .items() rather than the Python 2-only .iteritems(), so this runs on
    # both Python 2 and Python 3
    return pd.concat([addcol('source', source,
                             pd.DataFrame.from_dict(matches, orient='index'))
                      for source, matches in matching.items()])


keyword_match_df = _matches_to_frame(keyword_matching)

keyword_match_stem_df = _matches_to_frame(keyword_matching_broad)

In [24]:
keyword_match_df

Unnamed: 0,hit,miss,source
120-240v,,17.0,diy.csv
240v,34.0,35.0,diy.csv
abs,4.0,4.0,diy.csv
access-panel,2.0,4.0,diy.csv
accessibility,,3.0,diy.csv
acid,,1.0,diy.csv
acoustic,2.0,13.0,diy.csv
acrylic,9.0,4.0,diy.csv
addition,1.0,1.0,diy.csv
adhesive,27.0,117.0,diy.csv


In [25]:
keyword_match_stem_df

Unnamed: 0,hit,miss,source
120-240v,,17.0,diy.csv
240v,34.0,35.0,diy.csv
abs,4.0,4.0,diy.csv
access-panel,2.0,4.0,diy.csv
accessibility,,3.0,diy.csv
acid,,1.0,diy.csv
acoustic,3.0,12.0,diy.csv
acrylic,9.0,4.0,diy.csv
addition,1.0,1.0,diy.csv
adhesive,28.0,116.0,diy.csv


let's clean the rows so that they contain the following info:

* replace NaNs with 0
* calculate hit percentage
* number of words (for filtering)
* join the frames together

In [26]:
# a keyword that never hit (or never missed) has no counter entry and shows
# up as NaN in the frame; treat it as a count of 0
for match_frame in (keyword_match_df, keyword_match_stem_df):
    match_frame.fillna(0, inplace=True)

In [29]:
keyword_match_stem_df

Unnamed: 0,hit,miss,source
120-240v,0.0,17.0,diy.csv
240v,34.0,35.0,diy.csv
abs,4.0,4.0,diy.csv
access-panel,2.0,4.0,diy.csv
accessibility,0.0,3.0,diy.csv
acid,0.0,1.0,diy.csv
acoustic,3.0,12.0,diy.csv
acrylic,9.0,4.0,diy.csv
addition,1.0,1.0,diy.csv
adhesive,28.0,116.0,diy.csv


In [31]:
# total = hits + misses for the keyword;
# match_percentage = fraction of that total that were exact hits (0..1)
exact_total = keyword_match_df['hit'] + keyword_match_df['miss']
keyword_match_df['match_percentage'] = keyword_match_df['hit'] / exact_total
keyword_match_df['total'] = exact_total




In [32]:
# fraction of the keyword's occurrences that were hits under stem matching
stem_total = keyword_match_stem_df['hit'] + keyword_match_stem_df['miss']
keyword_match_stem_df['match_stem_percentage'] = keyword_match_stem_df['hit'] / stem_total
# suffix the count columns so they do not clash with the exact-match
# columns in the join
keyword_match_stem_df.rename(columns={'hit': 'hit_stem', 'miss': 'miss_stem'},
                             inplace=True)

In [33]:
#convert keyword to column and reset row numbering on both frames before join
keyword_match_df.index.name='keyword'
keyword_match_df.reset_index(inplace=True)
keyword_match_stem_df.index.name='keyword'
keyword_match_stem_df.reset_index(inplace=True)
#join
keyword_matches_combined = pd.merge(keyword_match_df, keyword_match_stem_df, on=['keyword','source'])

In [35]:
keyword_matches_combined

Unnamed: 0,keyword,hit,miss,source,match_percentage,total,hit_stem,miss_stem,match_stem_percentage
0,120-240v,0.0,17.0,diy.csv,0.000000,17.0,0.0,17.0,0.000000
1,240v,34.0,35.0,diy.csv,0.492754,69.0,34.0,35.0,0.492754
2,abs,4.0,4.0,diy.csv,0.500000,8.0,4.0,4.0,0.500000
3,access-panel,2.0,4.0,diy.csv,0.333333,6.0,2.0,4.0,0.333333
4,accessibility,0.0,3.0,diy.csv,0.000000,3.0,0.0,3.0,0.000000
5,acid,0.0,1.0,diy.csv,0.000000,1.0,0.0,1.0,0.000000
6,acoustic,2.0,13.0,diy.csv,0.133333,15.0,3.0,12.0,0.200000
7,acrylic,9.0,4.0,diy.csv,0.692308,13.0,9.0,4.0,0.692308
8,addition,1.0,1.0,diy.csv,0.500000,2.0,1.0,1.0,0.500000
9,adhesive,27.0,117.0,diy.csv,0.187500,144.0,28.0,116.0,0.194444


In [41]:
# number of words in the keyword; the words of a keyword are joined by '-'
keyword_matches_combined['keyword_length'] = keyword_matches_combined['keyword'].map(
    lambda keyword: len(keyword.split('-')))

In [55]:
# order by source, most frequent keywords first within each source
keyword_matches_combined.sort_values(by=['source', 'total'],
                                     ascending=[True, False],
                                     inplace=True)
# extra hits gained by stem matching, as a fraction of the keyword's total
extra_stem_hits = keyword_matches_combined['hit_stem'] - keyword_matches_combined['hit']
keyword_matches_combined['hit_difference'] = extra_stem_hits / keyword_matches_combined['total']

In [57]:
# keywords that gain the most from stem matching, largest gain first
report_columns = ['keyword', 'source', 'hit_difference', 'total']
keyword_matches_combined[report_columns].sort_values('hit_difference', ascending=False)

Unnamed: 0,keyword,source,hit_difference,total
1867,caravans,travel.csv,1.000000,3.0
3743,searing,cooking.csv,1.000000,1.0
1788,boarding-passes,travel.csv,1.000000,1.0
2509,moldovan-citizens,travel.csv,1.000000,1.0
240,flush-mount,diy.csv,1.000000,1.0
2812,scooters,travel.csv,1.000000,6.0
690,watering,diy.csv,1.000000,1.0
3373,differences,cooking.csv,1.000000,4.0
1036,indicator,biology.csv,1.000000,1.0
549,sip,diy.csv,1.000000,1.0


it seems like there are very few cases where this stemming procedure is harmful (3, most involving the use of the word "roof")

some summary statistics from the hits and misses:

In [64]:
#weighted averages per source: summed hits divided by summed totals

g1 = keyword_matches_combined.groupby(by='source')
# sum once, and only the columns the ratios below need
source_totals = g1[['hit', 'hit_stem', 'total']].sum()

# print(...) with a single argument gives the same output on Python 2 and 3
print('non-stemmed')
print(source_totals['hit'] / source_totals['total'])
print('\nstemmed')
print(source_totals['hit_stem'] / source_totals['total'])
print('\nratio')
print(source_totals['hit_stem'] / source_totals['hit'])

non-stemmed
source
biology.csv     0.115655
cooking.csv     0.391503
crypto.csv      0.303525
diy.csv         0.331272
robotics.csv    0.271806
travel.csv      0.269787
dtype: float64

stemmed
source
biology.csv     0.145329
cooking.csv     0.472964
crypto.csv      0.349106
diy.csv         0.432310
robotics.csv    0.313575
travel.csv      0.392021
dtype: float64

ratio
source
biology.csv     1.256572
cooking.csv     1.208072
crypto.csv      1.150172
diy.csv         1.305000
robotics.csv    1.153672
travel.csv      1.453078
dtype: float64


Biology has a low hit rate (about 12% exact, 15% stemmed), and the other categories are fairly low as well. Considering stemmed forms of words yields 15-45% more hits than exact matching, depending on the category.

Using more traditional methods, the cooking, crypto, and diy categories would likely perform the best, while biology would perform terribly. It is uncertain how physics would perform, but since biology is a science and performs terribly, let's see which biology keywords get the most misses.

In [65]:
# keep only the rows that came from the biology source
is_biology = keyword_matches_combined['source'] == 'biology.csv'
biology = keyword_matches_combined.loc[is_biology]

In [67]:
# biology keywords ordered by number of misses, largest first
biology.sort_values(by='miss', ascending=False)

Unnamed: 0,keyword,hit,miss,source,match_percentage,total,hit_stem,miss_stem,match_stem_percentage,keyword_length,hit_difference
1012,human-biology,1.0,1447.0,biology.csv,0.000691,1448.0,1.0,1447.0,0.000691,2,0.000000
970,genetics,38.0,1191.0,biology.csv,0.030919,1229.0,38.0,1191.0,0.030919,1,0.000000
767,biochemistry,3.0,981.0,biology.csv,0.003049,984.0,3.0,981.0,0.003049,1,0.000000
929,evolution,179.0,980.0,biology.csv,0.154443,1159.0,179.0,980.0,0.154443,1,0.000000
1116,molecular-biology,7.0,856.0,biology.csv,0.008111,863.0,7.0,856.0,0.008111,2,0.000000
808,cell-biology,4.0,755.0,biology.csv,0.005270,759.0,4.0,755.0,0.005270,2,0.000000
773,bioinformatics,16.0,647.0,biology.csv,0.024133,663.0,16.0,647.0,0.024133,1,0.000000
1141,neuroscience,9.0,605.0,biology.csv,0.014658,614.0,9.0,605.0,0.014658,1,0.000000
794,botany,2.0,563.0,biology.csv,0.003540,565.0,2.0,563.0,0.003540,1,0.000000
1190,physiology,8.0,548.0,biology.csv,0.014388,556.0,8.0,548.0,0.014388,1,0.000000


As I expected, the main source for the issue of keywords not being present in the content of the question is the fact that it is a general category, and obvious to humans from context. If you look at the top row, only 1 out of 1448 of the questions labeled "human-biology" contain the phrase "human biology" or "human-biology".

I could check to see if the words "human" and "biology" exist in these questions, but it should be somewhat obvious that there are word associations in play that might do the trick better.

Finally, I will prepare data for export to Gephi with the edgelists and nodelists.

In [68]:
keyword_pairs

{'biology.csv': Counter({frozenset({'dna', 'light'}): 1,
          frozenset({'enzymes', 'protein-expression'}): 1,
          frozenset({'evolution', 'plant-physiology'}): 14,
          frozenset({'autoimmune', 'celiac-disease'}): 1,
          frozenset({'heart-output', 'signal-processing'}): 1,
          frozenset({'blood-pressure', 'measurement'}): 2,
          frozenset({'evolution', 'osmoregulation'}): 1,
          frozenset({'ecosystem', 'predation'}): 2,
          frozenset({'cancer', 'reference-request'}): 3,
          frozenset({'evolution', 'muscles'}): 2,
          frozenset({'development', 'evolution'}): 16,
          frozenset({'endocrinology', 'sexual-reproduction'}): 1,
          frozenset({'ecology', 'entomology'}): 14,
          frozenset({'digestive-system', 'pathology'}): 4,
          frozenset({'medicinal-chemistry', 'microbiology'}): 1,
          frozenset({'mrna', 'translation'}): 4,
          frozenset({'ligation', 'molecular-biology'}): 1,
          frozenset({'h

In [71]:
# one row per keyword pair (frozenset), one column per category;
# NaN where a pair does not occur in that category
pairs_df = pd.DataFrame(keyword_pairs)

In [72]:
pairs_df

Unnamed: 0,biology.csv,cooking.csv,crypto.csv,diy.csv,robotics.csv,travel.csv
"(light, dna)",1.0,,,,,
"(enzymes, protein-expression)",1.0,,,,,
"(zoology, salt)",1.0,,,,,
"(proteins, cloning)",1.0,,,,,
"(autoimmune, celiac-disease)",1.0,,,,,
"(signal-processing, heart-output)",1.0,,,,,
"(blood-pressure, measurement)",2.0,,,,,
"(evolution, osmoregulation)",1.0,,,,,
"(reference-request, cancer)",3.0,,,,,
"(evolution, muscles)",2.0,,,,,


In [81]:
# expose the frozenset index as an ordinary column called 'keyword_pair'
pairs_df.index.names = ['keyword_pair']
pairs_df.reset_index(inplace=True)

In [82]:

# A frozenset has no order, so sort each pair (once) to get deterministic,
# alphabetical endpoints for the edge.
sorted_pairs = [sorted(keyword_pair) for keyword_pair in pairs_df.keyword_pair]
pairs_df['from'] = [endpoints[0] for endpoints in sorted_pairs]
# [-1] instead of [1]: identical for a two-keyword pair, and a pair that
# collapsed to one element (a question listing the same tag twice) becomes
# a self-loop instead of raising IndexError.
pairs_df['to'] = [endpoints[-1] for endpoints in sorted_pairs]

In [84]:
#total number of NaN cells in the frame; if every pair occurred in exactly
#one of the 6 categories, each row would hold 5 NaNs and this total would
#be a multiple of 5
pairs_df.isnull().sum().sum()

397641

great... 397641 is not a multiple of 5, so some keyword pairs occur in more than one category. I guess I will make sure to keep all group variables at one level next time I do an aggregation like this...

In [88]:
def isolate_column(x, colname):
    '''Extract the edges of one category from the wide pairs frame.

    Parameters:
        x: frame with 'from' and 'to' columns plus one count column per
           category.
        colname: name of the category column to extract.

    Returns a new frame with columns 'from', 'to', 'weight' (the counts
    from ``colname``) and 'source' (always ``colname``), holding only the
    rows where ``colname`` is not NaN. ``x`` is left unmodified.
    '''
    # Select the three columns first instead of copying the whole frame.
    # subset=[colname] drops pairs absent from this category; thresh=1 kept
    # every row, because 'from' and 'to' are never null.
    subframe = x[['from', 'to', colname]].dropna(subset=[colname])
    # rename returns a new frame, so the result has to be kept
    subframe = subframe.rename(columns={colname: 'weight'})
    subframe['source'] = colname
    return subframe

# one edge table per category, stacked into a single edge list
per_source_edges = [isolate_column(pairs_df, source) for source in categories]
edgelist = pd.concat(per_source_edges)

In [89]:
# constant column marking every edge as undirected
# NOTE(review): presumably read by Gephi on import - confirm the column
# name/value it expects
edgelist['type'] = 'undirected'

on second thought, a nodelist will be unnecessarily complicated because I would have to include the names of the sources in each term to avoid duplication; instead I will color the edges by the source of the connection, and sparse connections between categories should be sufficient

In [90]:
# write the edge list to disk without the row index
edgelist.to_csv('edgelist.csv',index=False)

In [None]:
keyword_matches_combined.to