## Organising data by section

The following Python code reads the CSV files that contain all Guardian articles for each year. Each byline is compared against lists of female and male first names compiled from [this dataset](https://data.world/howarder/gender-by-name). From this comparison a new CSV file is created for each year, showing the number of male and female bylines in each section. If an article has more than one author, it is counted once for each author.

In [11]:
import pandas as pd
import csv

In [12]:
# a function that tallies the number of bylines that are male or female
def sort(data, number, male_names=None, female_names=None):
    """Count the male and female bylines of a single article row.

    The byline text is split on commas into authors. For each author the
    first word is looked up in the name sets, and so is every word that
    directly follows an "and" (to catch "X and Y" style bylines). The word
    "Guardian" is never counted, and names found in neither set are ignored.

    Args:
        data: One article row, indexable by position (for example a tuple
            produced by ``DataFrame.itertuples()``). For years after 2011
            the byline is read from position 6 and the section from
            position 7; otherwise the byline is read from position 5 and
            the section from position 2.
        number: The year the row belongs to; selects the column layout.
        male_names: Optional collection of male first names. Defaults to
            the notebook-level ``male`` set.
        female_names: Optional collection of female first names. Defaults
            to the notebook-level ``female`` set.

    Returns:
        A dict with the keys ``'section'``, ``'male'`` and ``'female'``.
    """
    # fall back to the name sets built elsewhere in the notebook
    if male_names is None:
        male_names = male
    if female_names is None:
        female_names = female

    # account for different database layouts
    if number > 2011:
        byline, section = data[6], data[7]
    else:
        byline, section = data[5], data[2]

    male_articles = 0
    female_articles = 0

    # check for multiple authors
    for author in str(byline).split(","):
        names = author.split()
        if not names:
            continue

        # the author's first word, plus every word that directly follows "and"
        candidates = [names[0]]
        candidates.extend(
            following
            for word, following in zip(names, names[1:])
            if word == "and"
        )

        for first_name in candidates:
            # account for articles written as Guardian, otherwise would be
            # classed as male via dataset
            if first_name == "Guardian":
                continue
            if first_name in female_names:
                female_articles += 1
            elif first_name in male_names:
                male_articles += 1

    return {
        'section': section,
        'male': male_articles,
        'female': female_articles,
    }

In [13]:
# load the name/gender dataset; the next cell splits it into male and female name sets
df = pd.read_csv("name_gender.csv")

In [14]:
# start with two empty sets of names
male, female = set(), set()

# map each gender code to the set its names belong in
by_code = {"M": male, "F": female}

# file every name under the set for its gender code; other codes are skipped
for row in df.itertuples():
    bucket = by_code.get(row[2])
    if bucket is not None:
        bucket.add(row[1])

In [16]:
# read csv document for each year
for year in range(2008, 2018):
    print(year)
    results = []
    database = pd.read_csv('articles' + str(year) + '.csv')
    # analyse gender: one tally dict per article row
    # (plain loop, not a comprehension, so `row` remains a notebook-level name)
    for row in database.itertuples():
        results.append(sort(row, year))
    df2 = pd.DataFrame(results)
    # group data by section; the count columns are selected explicitly so the
    # written file is always ordered section, female, male, which is the order
    # the later cells assume when they read it back by position
    df3 = df2.groupby("section")[["female", "male"]].sum()
    # create new csv with all sections
    df3.to_csv('all_sections' + str(year) + '.csv')
    

2008
2009
2010
2011
2012
2013
2014
2015
2016
2017


In [19]:
# function that lets you choose top sections
def clean_sections(year, threshold=1700):
    """Write the largest sections of one year to ``top_sections<year>.csv``.

    Reads ``all_sections<year>.csv`` and keeps every section whose female and
    male counts add up to more than ``threshold``. Columns are looked up by
    name, so the result does not depend on the column order of the input.

    Args:
        year: Year whose ``all_sections<year>.csv`` file is read.
        threshold: A section is kept when female + male is strictly greater
            than this value. Adjust it depending on how many sections you
            want; the default is 1700.

    Returns:
        None. The kept sections are written to ``top_sections<year>.csv``
        with ``section`` as the index followed by ``female`` and ``male``,
        and "done" is printed.
    """
    # read in whichever year you want to choose top sections of
    database = pd.read_csv('all_sections' + str(year) + '.csv')

    # total bylines per section decides whether it is big enough to be kept
    totals = database["female"].astype(int) + database["male"].astype(int)
    top = database.loc[totals > threshold, ["section", "female", "male"]]

    # set index to section for easier visualisation; an empty selection
    # still produces a header-only file instead of raising
    top.set_index('section').to_csv('top_sections' + str(year) + '.csv')
    print("done")

In [20]:
# build top_sections2008.csv from all_sections2008.csv
clean_sections(2008)

done


done


In [398]:
# load the section table that the following cells filter by total count
database = pd.read_csv("messy_sections.csv")

In [422]:
final = []

# keep only the rows whose two count columns add up to more than 1700;
# positions 1-3 of each row are stored as section, female and male
for row in database.itertuples():
    if int(row[2]) + int(row[3]) <= 1700:
        continue
    d = dict(zip(("section", "female", "male"), row[1:4]))
    final.append(d)
        
        

In [423]:
# turn the kept rows into a dataframe and display it to check how many sections passed
df4 = pd.DataFrame(final)
df4

Unnamed: 0,female,male,section
0,1861,2571,Books
1,3134,6747,Business
2,549,1152,Culture
3,1934,1065,Education
4,895,1922,Environment
5,719,3102,Film
6,672,7408,Football
7,2458,2308,Life and style
8,3473,6098,Media
9,1731,1591,Money


In [424]:
# set index to section for easier visualisation
df5 = df4.set_index('section')

In [425]:
# save the kept sections to 2008_sections.csv and confirm completion
df5.to_csv('2008_sections.csv')
print("done")

done
