In [12]:
# modules we'll use
import os

import numpy as np
import pandas as pd

# helpful modules
import chardet
import fuzzywuzzy
from fuzzywuzzy import process

# read in all our data
# The CSV location can be overridden with the PROFESSORS_CSV environment
# variable so the notebook is not tied to one machine's directory layout;
# when the variable is unset the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "PROFESSORS_CSV",
    "/Users/mountasser/Desktop/Data-Cleaning/Inconsistent Data Entry/pakistan_intellectual_capital.csv",
)
professors = pd.read_csv(DATA_PATH)

# set seed for reproducibility
np.random.seed(0)

# quick look at the first rows to confirm the file loaded as expected
print(professors.head())


   Unnamed: 0  S#         Teacher Name  \
0           2   3      Dr. Abdul Basit   
1           4   5      Dr. Waheed Noor   
2           5   6     Dr. Junaid Baber   
3           6   7  Dr. Maheen Bakhtyar   
4          24  25          Samina Azim   

            University Currently Teaching             Department  \
0               University of Balochistan  Computer Science & IT   
1               University of Balochistan  Computer Science & IT   
2               University of Balochistan  Computer Science & IT   
3               University of Balochistan  Computer Science & IT   
4  Sardar Bahadur Khan Women's University       Computer Science   

  Province University Located          Designation Terminal Degree  \
0                 Balochistan  Assistant Professor             PhD   
1                 Balochistan  Assistant Professor             PhD   
2                 Balochistan  Assistant Professor             PhD   
3                 Balochistan  Assistant Professor        

In [14]:
# normalise the country names: lower-case them, then trim whitespace from
# both ends of each value
professors['Country'] = professors['Country'].str.lower().str.strip()

# the (up to) ten distinct country values that score best against
# "south korea" under fuzzywuzzy's token_sort_ratio
countries = professors['Country'].unique()
matches = fuzzywuzzy.process.extract(
    "south korea",
    countries,
    limit=10,
    scorer=fuzzywuzzy.fuzz.token_sort_ratio,
)

def replace_matches_in_column(df, column, string_to_match, min_ratio = 47, limit = 10):
    """Overwrite near-duplicate spellings in ``df[column]`` with ``string_to_match``.

    The distinct, non-missing values of the column are scored against
    ``string_to_match`` with fuzzywuzzy's ``token_sort_ratio``. Among the
    ``limit`` best-scoring candidates, every value whose score is at least
    ``min_ratio`` is considered a close match, and all rows holding such a
    value are set to ``string_to_match``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    column : label
        Column of ``df`` whose values are compared and replaced.
    string_to_match : str
        Canonical spelling; also the value written into matching rows.
    min_ratio : int, optional
        Inclusive score threshold for a candidate to count as a match
        (default 47).
    limit : int, optional
        Maximum number of best-scoring candidates requested from
        fuzzywuzzy (default 10).

    Returns
    -------
    None
        The result is the in-place change to ``df``; "All done!" is printed
        when the replacement has finished.
    """
    # Distinct values to compare against. Missing values are dropped first:
    # they are not text, so they must never be scored as a candidate and
    # have their rows overwritten.
    strings = df[column].dropna().unique()

    # Best-scoring candidates for the input string; each entry is indexed as
    # (value, score).
    matches = fuzzywuzzy.process.extract(string_to_match, strings,
                                         limit=limit, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # Keep only the candidates whose score reaches the threshold.
    close_matches = [match[0] for match in matches if match[1] >= min_ratio]

    # Boolean mask of the rows holding one of the close matches.
    rows_with_matches = df[column].isin(close_matches)

    # Replace those rows with the canonical spelling (in place, via .loc to
    # avoid chained assignment).
    df.loc[rows_with_matches, column] = string_to_match

    # let us know the function's done
    print("All done!")
    
# fold the close spelling variants of "south korea" in the Country column
# into that single spelling (default threshold of the helper is used)
replace_matches_in_column(professors, 'Country', "south korea")

# re-read the distinct country values to check the result
countries = professors['Country'].unique()
print(countries)

All done!
['thailand' 'pakistan' 'germany' 'austria' 'australia' 'uk' 'china'
 'france' 'usofa' 'south korea' 'malaysia' 'sweden' 'italy' 'canada'
 'norway' 'ireland' 'new zealand' 'urbana' 'portugal' 'russian federation'
 'usa' 'finland' 'netherland' 'greece' 'turkey' 'macau' 'singapore'
 'spain' 'japan' 'hongkong' 'saudi arabia' 'mauritius' 'scotland']


In [18]:
# trim whitespace from both ends of every "Graduated from" value, then list
# the distinct values that remain
graduated_from = professors["Graduated from"].str.strip()
professors["Graduated from"] = graduated_from
print(graduated_from.unique())

['Asian Institute of Technology'
 'Balochistan University of Information Technology, Engineering and Management Sciences'
 'University of Balochistan' "Sardar Bahadur Khan Women's University"
 'SRH Hochschule Heidelberg'
 'Institute of Business Administration,Karachi' 'DUET,Karachi'
 'University of Turbat' 'University of Vienna' 'Monash University'
 'University of Stirling' 'Chinese Academy of Sciences'
 'University of Innsbruck' 'Vienna University of Technology'
 'University of Paris-Est' 'The University of Cambridge'
 'Harbin Institute of Technology' 'University of Nice, Sophia Antipolis'
 'The University of York' 'Galilée - Université Paris 13'
 'University of Bedfordshire' 'North Dakota State University'
 'Kyungpook National University' 'The University of Manchester'
 'National University of Sciences and Technology'
 'FAST– National University of Computer and Emerging Sciences'
 'Capital University of Science & Technology' 'Gomal University'
 'University of Malaya' 'KTH Royal Insti

In [20]:
# collect the distinct values of the 'Country' column in alphabetical order
# so that similar spellings end up next to each other
countries = np.sort(professors['Country'].unique())
print(countries)

['australia' 'austria' 'canada' 'china' 'finland' 'france' 'germany'
 'greece' 'hongkong' 'ireland' 'italy' 'japan' 'macau' 'malaysia'
 'mauritius' 'netherland' 'new zealand' 'norway' 'pakistan' 'portugal'
 'russian federation' 'saudi arabia' 'scotland' 'singapore' 'south korea'
 'spain' 'sweden' 'thailand' 'turkey' 'uk' 'urbana' 'usa' 'usofa']


In [21]:
# candidate spellings of "usa" among the current country values, scored with
# fuzzywuzzy's token_sort_ratio
matches = fuzzywuzzy.process.extract(
    "usa",
    countries,
    limit=10,
    scorer=fuzzywuzzy.fuzz.token_sort_ratio,
)

# rewrite the close matches (score of at least 70) as "usa", then list the
# distinct country values that remain
replace_matches_in_column(professors, 'Country', "usa", min_ratio=70)
print(professors['Country'].unique())

All done!
['thailand' 'pakistan' 'germany' 'austria' 'australia' 'uk' 'china'
 'france' 'usa' 'south korea' 'malaysia' 'sweden' 'italy' 'canada'
 'norway' 'ireland' 'new zealand' 'urbana' 'portugal' 'russian federation'
 'finland' 'netherland' 'greece' 'turkey' 'macau' 'singapore' 'spain'
 'japan' 'hongkong' 'saudi arabia' 'mauritius' 'scotland']
