### Challenge: Data cleaning & validation

- Determine the five most common journals and the total articles for each. 
- Calculate the mean, median, and standard deviation of the open-access cost per article for each journal.

- Real bonus round, identify the open access prices paid by subject area.

In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re #regex library
from fuzzywuzzy import fuzz

%matplotlib inline

#CREATING AND PREPARING DATAFRAMES
    
#Creating original dataframe 
apc_original = pd.read_csv('WELLCOME_APC_original.csv', encoding ='ISO-8859-1')

#Creating a dataframe to clean 
apc = pd.read_csv('WELLCOME_APC.csv', encoding = 'ISO-8859-1')

#Removing white spaces from column names
for col in apc.columns:
    apc.rename(columns={col:col.replace(' ','_')},inplace=True)

apc.rename(columns={'COST_(£)_charged_to_Wellcome_(inc_VAT_when_charged)':'cost'},inplace=True)    
apc.head(5)


Unnamed: 0,PMID/PMCID,Publisher,Journal_title,Article_title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [109]:
#Removing white spaces from the begging and final positions of the records

apc[['Journal_title','Publisher','Article_title']].apply(lambda x : str(x).strip().lstrip())


Journal_title    0                                  Psychologic...
Publisher        0                                             ...
Article_title    0       Reduced parahippocampal cortical thick...
dtype: object

In [110]:
#Looking for NULL values
apc.isnull().sum()
apc['Journal_title'].nunique()

984

In [111]:
#Functions


def getAbbreviation(string):
    
    """ Function: getAbbreviation 
        Parameters: a string with 1 or more words
        Return: a string's abbreviation, taking the first letter of each word in the string given
    """ 
    
    abbrev = []
    words = []
    pattern = re.compile('^[A-Z]{3}$')
    
    exclude_words = ['the','of','for','with','at','&','and','in','on','an']
    
    for w in str(string).split(' '):
       
        if w.lower() not in exclude_words:
            words.append(w)

    for word in words:
        if len(word) != 0 :
            result = pattern.match(word)
            if pattern.match(word):
                abbrev.append(word)
                break
            else:
                for letter in word[0]:
                    abbrev.append(letter[0])

    return ''.join(abbrev)



def getIntersectionP(x,y):
    
    """ Function: getIntersection 
    Parameters: x =       , y = 
    Return: a string's abbreviation, taking the first letter of each word in the string given
    """   
    
    if (x,y) in similarRowsPublisher.keys(): 
        return similarRowsPublisher[(x,y)]
    else:
        return -1
    
def getIntersectionJ(x,y):
    
    """ Function: getIntersection 
    Parameters: x =       , y = 
    Return: a string's abbreviation, taking the first letter of each word in the string given
    """   
    
    if (x,y) in similarRowsJournal.keys(): 
        return similarRowsJournal[(x,y)]
    else:
        return -1

    
def changeifSimilar(dataframe,c1,c2,c3):
    
    """ Function: changeifSimilar 
        Parameters: dataframe
                    c1 is the column's name where the original name is, this string may or may not change; 
                    c2 is the column's name where the proposed new name is. The suggestion is based on the number of matchs in dataset
                    c3 is the column's name where the original name abbreavtion is.
        Return: a string (it may be the original name or the one that matchs with similar names)
    """
      
    original_name = dataframe[c1]
    pattern = dataframe[c2]
    abb = dataframe[c3]

    ratio_name = fuzz.partial_ratio(str(pattern).lower(),str(original_name).lower())
    name = original_name
        
    if ((ratio_name > 89) or (original_name.upper() == abb)):        
        name = pattern
        
    else:
        for word in original_name.split():
            if word.upper() in abb :               
                name = pattern 
                break
    
    return name
    

    
def findMatch(dataframe,c1,c2,c3):

    """Function: findMatch
            Parameters: dataframe 
            c1 is the column's name where the abbreviation is.
            c2 is the column's name where the number of similar rows is.
            c3 is the column's name where the Publisher o Journal Title is.
            Return: a string (it's a suggested name based on how much matchs with similar names)

    """
    
    match = ''    
    abb = dataframe[c1] 
    maxRows = apc[apc[c1]==abb][c2].max()
    match = apc[(apc[c1]==abb)&(apc[c2]==maxRows)][c3].iloc[0]
    return match


In [112]:
apc['Publisher'] = apc_original['Publisher']


#STEP  : Creating abbreviation column (SET A= Abbreviations)
apc['Publisher_abbreviation'] = apc['Publisher'].apply(lambda x: getAbbreviation(x)) 
apc['Journal_abbreviation'] = apc['Journal_title'].apply(lambda x: getAbbreviation(x))


#STEP  : Lowerizing and titlerizing publishers' names (SET B = Names)
apc['Publisher'] = apc['Publisher'].apply(lambda x:str(x).lower().title())
apc['Journal_title'] = apc['Journal_title'].apply(lambda x:str(x).lower().title())



# STEP : Creating a new set (SET C = A ∩ B ) to verify if the intersection are same publishers or journal base on abbreviations
# or similiraty of names

intersectionPublisher = apc[['Publisher','Publisher_abbreviation']].groupby(['Publisher','Publisher_abbreviation'])['Publisher','Publisher_abbreviation'].count()

similarRowsPublisher={}
for ind,x in intersectionPublisher.iterrows():
    val = intersectionPublisher.loc[ind,'Publisher']
    similarRowsPublisher[ind] = val


intersectionJournal = apc[['Journal_title','Journal_abbreviation']].groupby(['Journal_title','Journal_abbreviation'])['Journal_title','Journal_abbreviation'].count()

similarRowsJournal={}
for ind,x in intersectionJournal.iterrows():
    val = intersectionJournal.loc[ind,'Journal_title']
    similarRowsJournal[ind] = val

    
    
apc['Publisher_similarRows'] = apc.apply(lambda x: getIntersectionP(x['Publisher'] ,x['Publisher_abbreviation']), axis=1)
apc['Journal_similarRows'] = apc.apply(lambda x: getIntersectionJ(x['Journal_title'] ,x['Journal_abbreviation']), axis=1)

#Finding matchs for Journal and Pubisher's names

apc['Publisher_match']=apc.apply(lambda x: findMatch(x,'Publisher_abbreviation','Publisher_similarRows','Publisher'),1)
apc['Journal_match']=apc.apply(lambda x: findMatch(x,'Journal_abbreviation','Journal_similarRows','Journal_title'),1)    

#Uniforming names if Journal and Pubisher's names are similar to others
apc['Publisher_newname'] = apc.apply(lambda x: changeifSimilar(x,'Publisher','Publisher_match','Publisher_abbreviation'),1)
apc['Journal_newname']=apc.apply(lambda x: changeifSimilar(x,'Journal_title','Journal_match','Journal_abbreviation'),1) 



In [113]:
apc[['Publisher','Publisher_match','Publisher_newname','Publisher_abbreviation']].head(100)

Unnamed: 0,Publisher,Publisher_match,Publisher_newname,Publisher_abbreviation
0,Cup,Cambridge University Press,Cambridge University Press,CUP
1,Acs,American Chemical Society,American Chemical Society,ACS
2,Acs,American Chemical Society,American Chemical Society,ACS
3,Acs,American Chemical Society,American Chemical Society,ACS
4,Acs,American Chemical Society,American Chemical Society,ACS
5,Acs,American Chemical Society,American Chemical Society,ACS
6,Acs,American Chemical Society,American Chemical Society,ACS
7,Acs,American Chemical Society,American Chemical Society,ACS
8,Acs (Amercian Chemical Society) Publications,American Chemical Society,American Chemical Society,ACS
9,Acs (Amercian Chemical Society) Publications,American Chemical Society,American Chemical Society,ACS


## Cleaning is done, then we can:

### 1. Determine the five most common journals and the total articles for each

In [121]:
most_common_journal = apc['Journal_newname'].value_counts()
print(most_common_journal[0:5])

Plos One                           191
Journal Of Biological Chemistry     65
Bmc Public Health                   43
Neuroimage                          31
Nucleic Acids Research              29
Name: Journal_newname, dtype: int64


### 2. Calculate the mean, median, and standard deviation of the open-access cost per article for each journal.

In [122]:
#Cleaning cost column (removing pound signs and convert str to float)

def cleaningNumbers(dirtyNumbers):
    number = 0.00
    if dirtyNumbers != []:
        number = float(dirtyNumbers[0])
    return number

apc['cost_number'] = apc.cost.apply(lambda x:  cleaningNumbers(re.findall(r'\d*\.\d*$',x)))


#Calculate the mean, median, and standard deviation of the open-access cost per article for each journal.
results = pd.DataFrame(apc[['Journal_newname','cost_number']])

total_statistics['mean'] = results[['Journal_newname','cost_number']].groupby('Journal_newname').mean()
total_statistics['median'] = results[['Journal_newname','cost_number']].groupby('Journal_newname').median()
total_statistics['std'] = results[['Journal_newname','cost_number']].groupby('Journal_newname').std()


round(total_statistics[['mean','median','std']].head(100),2)

Unnamed: 0_level_0,mean,median,std
Journal_newname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Academy Of Nutrition And Dietetics,2379.54,2379.54,
Acs Chemical Biology,1201.75,1227.28,511.95
Acs Chemical Neuroscience,,,
Acs Nano,,,
Acta Crystallographica Section D: Biological Crystallography,772.58,772.58,1.64
Acta Crystallographica Section F: Structural Biology And Crystallization Communications,796.64,796.64,15.61
"Acta Crystallographica, Section D",757.18,757.18,
Acta Crystallography D,774.19,774.19,
Acta D,1543.22,1543.22,1121.56
Acta Dermato Venereologica,653.96,653.96,
