In [5]:
import pandas as pd
import numpy as np
import sqlite3


In [35]:
# Load every row of the `reviews` table from the local SQLite file into a dataframe.
# The try/finally guarantees the connection is released even if the query fails
# (for example when the table is missing), so the kernel never keeps a stale handle.
con = sqlite3.connect("Pitchfork.sqlite")
try:
    df_pitchfork = pd.read_sql_query("SELECT * FROM reviews", con)
finally:
    con.close()

# Verify that result of SQL query is stored in the dataframe
df_pitchfork.head()

   reviewid                 title            artist  \
0     22703             mezzanine    massive attack   
1     22721          prelapsarian          krallice   
2     22659  all of them naturals      uranium club   
3     22661           first songs  kleenex, liliput   
4     22725             new start              taso   

                                                 url  score  best_new_music  \
0  http://pitchfork.com/reviews/albums/22703-mezz...    9.3               0   
1  http://pitchfork.com/reviews/albums/22721-prel...    7.9               0   
2  http://pitchfork.com/reviews/albums/22659-all-...    7.3               0   
3  http://pitchfork.com/reviews/albums/22661-firs...    9.0               1   
4  http://pitchfork.com/reviews/albums/22725-new-...    8.1               0   

           author               author_type    pub_date  pub_weekday  pub_day  \
0     nate patrin               contributor  2017-01-08            6        8   
1        zoe camp               

In [44]:
# Trim leading/trailing whitespace from reviewer names so the same author is not
# counted twice. The vectorized `.str.strip()` leaves missing values as NaN
# instead of raising AttributeError the way a per-element `d.strip()` would.
df_pitchfork['author'] = df_pitchfork['author'].str.strip()


In [45]:
# Display the first 30 rows of the reviews dataframe.
df_pitchfork.head(30)

Unnamed: 0,reviewid,title,artist,url,score,best_new_music,author,author_type,pub_date,pub_weekday,pub_day,pub_month,pub_year
0,22703,mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,contributor,2017-01-08,6,8,1,2017
1,22721,prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,contributor,2017-01-07,5,7,1,2017
2,22659,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,contributor,2017-01-07,5,7,1,2017
3,22661,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,associate reviews editor,2017-01-06,4,6,1,2017
4,22725,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,tracks coordinator,2017-01-06,4,6,1,2017
5,22722,insecure (music from the hbo original series),various artists,http://pitchfork.com/reviews/albums/22722-inse...,7.4,0,vanessa okoth-obbo,contributor,2017-01-05,3,5,1,2017
6,22704,stillness in wonderland,little simz,http://pitchfork.com/reviews/albums/22704-litt...,7.1,0,katherine st. asaph,contributor,2017-01-05,3,5,1,2017
7,22694,tehillim,yotam avni,http://pitchfork.com/reviews/albums/22694-tehi...,7.0,0,andy beta,contributor,2017-01-05,3,5,1,2017
8,22714,reflection,brian eno,http://pitchfork.com/reviews/albums/22714-refl...,7.7,0,andy beta,contributor,2017-01-04,2,4,1,2017
9,22724,filthy america its beautiful,the lox,http://pitchfork.com/reviews/albums/22724-filt...,5.3,0,ian cohen,contributor,2017-01-04,2,4,1,2017


In [46]:
# Collect the distinct values of the `author` column, then report how many
# there are followed by the values themselves.
unique_authors = df_pitchfork.loc[:, 'author'].unique()
print(len(unique_authors), unique_authors, sep='   ')

423   ['nate patrin' 'zoe camp' 'david glickman' 'jenn pelly' 'kevin lozano'
 'vanessa okoth-obbo' 'katherine st. asaph' 'andy beta' 'ian cohen'
 'marc masters' 'sheldon pearce' 'thea ballard' 'marcus j. moore'
 'dean van nguyen' 'louis pattison' 'philip sherburne' 'benjamin scheim'
 'rebecca haithcoat' 'sam sodomsky' 'seth colter walls' 'brian howe'
 'marc hogan' 'quinn moreland' 'seth colter-walls' 'savy reyes-kulkarni'
 'renato pagnani' 'matthew strauss' 'david turner' 'matthew ramirez'
 'jesse jarnow' 'andrew gaerig' 'paul a. thompson' 'nathan reese'
 'ryan dombal' 'jayson greene' 'cameron cook' 'saby reyes-kulkarni'
 'mehan jayasuriya' 'ben scheim' 'stuart berman' 'israel daramola'
 'evan rytlewski' 'daniel martin-mccormick' 'jay balfour' 'drew gaerig'
 'mark richardson' 'paul thompson' 'edwin "stats" houghton' 'laura snapes'
 'eric harvey' 'brad nelson' 'jonathan bernstein' 'caryn rose'
 'jes skolnik' 'miles raymer' 'nina mashurova' 'dorian lynskey'
 'jonathan patrick' 'kris ex' 

In [47]:
def levenshtein_ratio_and_distance(s, t, ratio_calc = False):
    """Compute the Levenshtein distance, or a similarity ratio, between two strings.

    A (len(s)+1) x (len(t)+1) matrix is filled so that distance[i, j] holds the
    edit cost between the first i characters of ``s`` and the first j characters
    of ``t``; the bottom-right cell is the cost for the full strings.

    Args:
        s: First string.
        t: Second string.
        ratio_calc: When false (default), substitutions cost 1 and the plain edit
            distance is reported. When true, substitutions cost 2 (to align with
            the python-Levenshtein package's ratio) and a similarity ratio is
            returned instead.

    Returns:
        If ``ratio_calc`` is false: the string
        ``"The strings are N edits away"``.
        If ``ratio_calc`` is true: a float in [0, 1] equal to
        ``(len(s) + len(t) - cost) / (len(s) + len(t))``; two empty strings are
        treated as identical and yield 1.0.
    """
    rows = len(s) + 1
    cols = len(t) + 1
    substitution_cost = 2 if ratio_calc else 1

    # First column / first row: cost of turning a prefix into the empty string.
    # Filled independently of each other so that an empty `s` or `t` still gets
    # a correct border (a nested loop would skip it entirely).
    distance = np.zeros((rows, cols), dtype=int)
    distance[:, 0] = np.arange(rows)
    distance[0, :] = np.arange(cols)

    # Fill the interior with the cheapest of deletion, insertion or substitution.
    for col in range(1, cols):
        for row in range(1, rows):
            cost = 0 if s[row - 1] == t[col - 1] else substitution_cost
            distance[row, col] = min(
                distance[row - 1, col] + 1,         # Cost of deletions
                distance[row, col - 1] + 1,         # Cost of insertions
                distance[row - 1, col - 1] + cost,  # Cost of substitutions
            )

    # Read the result from the bottom-right cell rather than from the loop
    # variables, which are undefined when either string is empty.
    edits = int(distance[-1, -1])

    if ratio_calc:
        total_length = len(s) + len(t)
        if total_length == 0:
            # Two empty strings are identical; avoid dividing by zero.
            return 1.0
        return (total_length - edits) / total_length

    # This is the minimum number of edits needed to convert string s to string t
    return "The strings are {} edits away".format(edits)

In [48]:
# Sort the author names alphabetically so near-duplicate spellings end up next
# to each other. `ndarray.sort()` sorts in place and returns None, so it must be
# called as a statement rather than wrapped in print().
unique_authors.sort()

for item in unique_authors:
    print(item)

None
aaron leitko
abby garnett
abigail covington
abigail garnett
adam dlugacz
adam moerder
adam ohler
adrienne day
al shipley
alan light
alan smithee
alex lindhart
alex linhardt
alexander iadarola
alexander lloyd linhardt
alison fields
allison hussey
amanda petrusich
amy granzin
amy phillips
andi rowlands
andrew bryant
andrew gaerig
andrew goldman
andrew lehman
andrew nosnitsky
andrew ryce
andy battaglia
andy beta
andy beta, brandon stosuy & mark richardson
andy emitt
andy linhardt
andy o' connor
andy o'connor
angus finlayson
anupa mistry
austin gaines
b michael payne
b. david zarley
barry walters
beatty & garrett
ben scheim
ben westhoff
benjamin scheim
bill morris
bob o. mcmillan
bob stanley
brad haywood
brad haywood & ryan schreiber
brad hurst
brad nelson
brandon soderberg
brandon stosuy
brandon wall
brendan mattox
brendan reid
brent dicrescenzo
brent s. sirota
brian burlage
brian howe
brian howe & brandon stosuy
brian james
britt julious
brock kappers
bruce tiffee
cameron cook
camer