# Part 1.4: Intermediate Python for Data Science

## Cleaning and Preparing Data in Python

Opening CSV file:

In [20]:
from csv import reader

# Read the whole CSV inside a context manager so the file handle is closed
# as soon as the rows are in memory. newline="" is what the csv module asks
# for so that line breaks inside quoted fields are parsed correctly.
with open('artworks.csv', encoding="utf8", newline="") as opened_file:
    read_file = reader(opened_file)
    moma = list(read_file)

# The first row holds the column names: keep it aside and drop it from the data.
moma_headers = moma[0]
moma = moma[1:]

Replacing substrings:

In [21]:
# str.replace returns a new string; age1 itself is left untouched.
age1 = "I am thirty-one years old"
old_age, new_age = "thirty-one", "thirty-two"
age2 = age1.replace(old_age, new_age)
print(age2)

I am thirty-two years old


Cleaning columns by replacing parentheses:

In [22]:
# Nationality (column 2) and gender (column 5) are cleaned in one pass by
# removing every "(" and ")" from the stored value.
for row in moma:
    for col in (2, 5):
        row[col] = row[col].replace("(", "").replace(")", "")

print(moma[200])

['She-Paul-A-Wee', 'William Henry Jackson', 'American', '(1843)', '(1942)', 'Male', '1875', 'Photography']


Cleaning capitalisation:

In [23]:
# Title-case gender (column 5) and nationality (column 2); wherever the
# title-cased value is empty, substitute the column's placeholder label.
placeholders = {5: "Gender Unknown/Other", 2: "Nationality Unknown"}

for row in moma:
    for col, fallback in placeholders.items():
        row[col] = row[col].title() or fallback

for row in moma[:5]:
    print(row[1], "whose gender is:", row[5])

Sarah Charlesworth whose gender is: Female
Pablo Palazuelo whose gender is: Male
Maurice Denis whose gender is: Male
Aristide Maillol whose gender is: Male
Eugène Atget whose gender is: Male


Cleaning the birthdays and death dates:

In [24]:
def clean_and_convert(date):
    """Convert a year string such as "(1947)" to an int.

    Parentheses are removed before conversion. An empty string is returned
    unchanged, so the result is either an int or "".
    """
    if date == "":
        return date
    digits = date.replace("(", "").replace(")", "")
    return int(digits)

# Columns 3 and 4 are read as birth_date and death_date; convert both in place.
for row in moma:
    row[3], row[4] = clean_and_convert(row[3]), clean_and_convert(row[4])

print(moma[:5])

[['Dress MacLeod from Tartan Sets', 'Sarah Charlesworth', 'American', 1947, 2013, 'Female', '1986', 'Prints & Illustrated Books'], ['Duplicate of plate from folio 11 verso (supplementary suite, plate 4) from ARDICIA', 'Pablo Palazuelo', 'Spanish', 1916, 2007, 'Male', '1978', 'Prints & Illustrated Books'], ['Tailpiece (page 55) from SAGESSE', 'Maurice Denis', 'French', 1870, 1943, 'Male', '1889-1911', 'Prints & Illustrated Books'], ['Headpiece (page 129) from LIVRET DE FOLASTRIES, À JANOT PARISIEN', 'Aristide Maillol', 'French', 1861, 1944, 'Male', '1927-1940', 'Prints & Illustrated Books'], ['97 rue du Bac', 'Eugène Atget', 'French', 1857, 1927, 'Male', '1903', 'Photography']]


Trying better print method:

In [25]:
import pandas as pd

# Render the first five rows as a table; the bare expression is the cell's output.
preview_rows = moma[:5]
pd.DataFrame(preview_rows, columns=moma_headers)

Unnamed: 0,Title,Artist,Nationality,BeginDate,EndDate,Gender,Date,Department
0,Dress MacLeod from Tartan Sets,Sarah Charlesworth,American,1947,2013,Female,1986,Prints & Illustrated Books
1,Duplicate of plate from folio 11 verso (supple...,Pablo Palazuelo,Spanish,1916,2007,Male,1978,Prints & Illustrated Books
2,Tailpiece (page 55) from SAGESSE,Maurice Denis,French,1870,1943,Male,1889-1911,Prints & Illustrated Books
3,Headpiece (page 129) from LIVRET DE FOLASTRIES...,Aristide Maillol,French,1861,1944,Male,1927-1940,Prints & Illustrated Books
4,97 rue du Bac,Eugène Atget,French,1857,1927,Male,1903,Photography


Parsing numbers from complex strings:

In [26]:
test_data = [
    "1912", "1929", "1913-1923",
    "(1951)", "1994", "1934",
    "c. 1915", "1995", "c. 1912",
    "(1988)", "2002", "1957-1959",
    "c. 1955.", "c. 1970's",
    "C. 1990-1999",
]

# Characters to delete from the date strings before they are parsed.
bad_chars = ["(", ")", "c", "C", ".", "s", "'", " "]


def strip_characters(string):
    """Return `string` with every entry of `bad_chars` removed."""
    cleaned = string
    for unwanted in bad_chars:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned


stripped_test_data = [strip_characters(raw) for raw in test_data]

print(stripped_test_data)

['1912', '1929', '1913-1923', '1951', '1994', '1934', '1915', '1995', '1912', '1988', '2002', '1957-1959', '1955', '1970', '1990-1999']


Parsing numbers from complex strings, part 2:

In [27]:
def strip_characters(string):
    """Return `string` with every entry of the module-level `bad_chars` removed."""
    result = string
    for unwanted in bad_chars:
        result = result.replace(unwanted, "")
    return result

def process_date(string):
    """Convert a cleaned date string to a single integer year.

    A string containing "-" (e.g. "1913-1923") is split on "-" and becomes
    the rounded average of its first two parts; any other string is passed
    straight to int().
    """
    if "-" not in string:
        return int(string)
    parts = string.split("-")
    start, end = int(parts[0]), int(parts[1])
    # round() rounds halves to the nearest even integer, e.g. 1994.5 -> 1994.
    return round((start + end) / 2)

processed_test_data = [process_date(cleaned) for cleaned in stripped_test_data]

print(processed_test_data)

# Column 6 is read as the date: strip the unwanted characters, then reduce
# the result to a single integer year.
for row in moma:
    row[6] = process_date(strip_characters(row[6]))

pd.DataFrame(moma[:5], columns=moma_headers)

[1912, 1929, 1918, 1951, 1994, 1934, 1915, 1995, 1912, 1988, 2002, 1958, 1955, 1970, 1994]


Unnamed: 0,Title,Artist,Nationality,BeginDate,EndDate,Gender,Date,Department
0,Dress MacLeod from Tartan Sets,Sarah Charlesworth,American,1947,2013,Female,1986,Prints & Illustrated Books
1,Duplicate of plate from folio 11 verso (supple...,Pablo Palazuelo,Spanish,1916,2007,Male,1978,Prints & Illustrated Books
2,Tailpiece (page 55) from SAGESSE,Maurice Denis,French,1870,1943,Male,1900,Prints & Illustrated Books
3,Headpiece (page 129) from LIVRET DE FOLASTRIES...,Aristide Maillol,French,1861,1944,Male,1934,Prints & Illustrated Books
4,97 rue du Bac,Eugène Atget,French,1857,1927,Male,1903,Photography


## Python Data Analysis Basics

Reading the dataset, cleaning date column:

In [28]:
from csv import reader

# Read the `artworks_clean.csv` file. The context manager closes the handle
# once every row has been loaded; newline="" is what the csv module asks for.
with open('artworks_clean.csv', encoding="utf8", newline="") as opened_file:
    read_file = reader(opened_file)
    moma = list(read_file)
moma = moma[1:]  # drop the header row

# Convert the birth date (3), death date (4) and date (6) columns to int in a
# single pass. Empty strings are left unchanged so that missing values remain
# distinguishable from real years.
for row in moma:
    for col in (3, 4, 6):
        if row[col] != "":
            row[col] = int(row[col])

Calculating ages:

In [29]:
ages = []

for row in moma:
    date = row[6]
    birth = row[3]
    # Either value may still be "" (missing) after loading, so only subtract
    # when both are integers; otherwise record 0, which the step below
    # reports as "Unknown".
    if isinstance(date, int) and isinstance(birth, int):
        age = date - birth
    else:
        age = 0
    ages.append(age)

# Ages of 20 or less (including the 0 placeholder) are reported as "Unknown".
final_ages = [age if age > 20 else "Unknown" for age in ages]

Converting age to decade:

In [30]:
# The final_ages variable is available
# from the previous screen

# Map each age to a decade label by replacing its last digit with "0s"
# (e.g. 47 -> "40s"); "Unknown" entries pass through unchanged.
decades = [
    age if age == "Unknown" else str(age)[:-1] + "0s"
    for age in final_ages
]

Summarising the decade data:

In [31]:
decade_frequency = {}

# Tally how many entries carry each decade label.
for decade in decades:
    decade_frequency[decade] = decade_frequency.get(decade, 0) + 1

print(decade_frequency)

{'30s': 4722, '60s': 1357, '70s': 559, '40s': 4081, '50s': 2434, '20s': 1856, 'Unknown': 1093, '90s': 253, '80s': 364, '100s': 3, '110s': 3}


Inserting variables into strings:

In [32]:
artist = "Pablo Picasso"
birth_year = 1881

# Named placeholders in the template are filled from keyword arguments.
template = "{name}'s birth year is {year}"
fields = {"name": artist, "year": birth_year}
output = template.format(**fields)
print(output)

Pablo Picasso's birth year is 1881


Creating frequency tables:

In [33]:
# Count rows per gender label (column 5).
gender_freq = {}
for row in moma:
    gender = row[5]
    gender_freq[gender] = gender_freq.get(gender, 0) + 1

print(gender_freq)

# Count rows per artist name (column 1).
artist_freq = {}
for row in moma:
    artist = row[1]
    artist_freq[artist] = artist_freq.get(artist, 0) + 1

# Show the artist counts as a two-column table: each (key, value) pair of the
# dict becomes one row.
print(
    pd.DataFrame(
        list(artist_freq.items()),
        columns=['Artist', 'Frequency']
    )
)



{'Female': 2443, 'Male': 13491, 'Gender Unknown/Other': 791}
                       Arist  Frequency
0         Sarah Charlesworth          1
1            Pablo Palazuelo          4
2              Maurice Denis         71
3           Aristide Maillol         77
4               Eugène Atget        705
...                      ...        ...
3952           Julio Le Parc          1
3953             George Ault          1
3954    Jean-Michel Basquiat          1
3955             Henri Goetz          1
3956  Matthew Rackham Barnes          1

[3957 rows x 2 columns]


Creating a function to insert variables into strings:

In [34]:
def artist_summary(name):
    """Print how many artworks `name` has according to `artist_freq`.

    Raises KeyError when `name` is not a key of `artist_freq`.
    """
    template = "There are {number} artworks by {artist_name} in the dataset"
    print(template.format(number=artist_freq[name], artist_name=name))

artist_summary("Henri Matisse")

There are 129 artworks by Henri Matisse in the dataset


Formatting numbers inside strings:

In [35]:
pop_millions = [
    ["China", 1379.302771],
    ["India", 1281.935991],
    ["USA", 326.625791],
    ["Indonesia", 260.580739],
    ["Brazil", 207.353391],
]

# The ",.2f" format spec adds a thousands separator and keeps two decimals.
template = "The population of {country} is {number:,.2f} million"

for country, number in pop_millions:
    output = template.format(country=country, number=number)
    print(output)

The population of China is 1,379.30 million
The population of India is 1,281.94 million
The population of USA is 326.63 million
The population of Indonesia is 260.58 million
The population of Brazil is 207.35 million


Summarising the artwork gender data:

In [36]:
gender_freq = {}

# Count rows per gender label (column 5).
for row in moma:
    gender_freq[row[5]] = gender_freq.get(row[5], 0) + 1

# "{n:,}" inserts a thousands separator into the count.
template = "There are {n:,} artworks by {g} artists"
for gender, num in gender_freq.items():
    print(template.format(g=gender, n=num))

There are 2,443 artworks by Female artists
There are 13,491 artworks by Male artists
There are 791 artworks by Gender Unknown/Other artists


## Object-Oriented Python

Determining type of object:

In [37]:
l = [1, 2, 3]
s = "string"
d = {"a": 1, "b": 2}

# type() reports the class of each object.
for obj in (l, s, d):
    print(type(obj))

<class 'list'>
<class 'str'>
<class 'dict'>


Using pass to avoid syntax errors:

In [38]:
# Deliberately left without a body: running this cell raises the SyntaxError shown below.
class MyClass:

SyntaxError: incomplete input (1752094626.py, line 1)

In [39]:
class MyClass:
    # `pass` is a do-nothing statement that gives the class a syntactically valid body.
    pass

Creating an instance of a class:

In [40]:
class MyClass:
    pass
# Calling the class creates a new instance of it.
my_instance = MyClass()
# The instance's type is the class it was created from.
print(type(my_instance))

<class '__main__.MyClass'>


Creating an instance of a basic class and calling a method:

In [41]:
class MyClass:
    """Minimal class with a single method."""

    def first_method(self):
        """Return a fixed message string."""
        message = "This is my first method"
        return message


my_instance = MyClass()

# The method is called on the instance; `self` is supplied automatically.
result = my_instance.first_method()
print(result)
print(type(result))

This is my first method
<class 'str'>


Creating a method that accepts an argument:

In [42]:
class MyClass:
    """Class demonstrating a method that takes an argument."""

    def first_method(self):
        """Return a fixed message string."""
        message = "This is my first method"
        return message

    def return_list(self, input_list):
        """Return `input_list` exactly as it was passed in."""
        return input_list


my_instance = MyClass()
result = my_instance.return_list([1, 2, 3])
print(result)

[1, 2, 3]


Using the init method when instantiating an object:

In [43]:
## Define the MyList class
class MyList:
    """Holds the value given at construction time in the `data` attribute."""

    def __init__(self, initial_data):
        ## Keep the supplied value on the instance as `self.data`
        self.data = initial_data


## Instantiate MyList with [1,2,3,4,5]; __init__ stores it in the .data attribute
my_list = MyList([1, 2, 3, 4, 5])

## Show the .data attribute of the my_list instance
print(my_list.data)

[1, 2, 3, 4, 5]


Adding an append method to a class and calling it on an instance:

In [44]:
class MyList:
    """A minimal list wrapper that exposes its items through `data`."""

    def __init__(self, initial_data):
        # Copy the input so that later changes to the caller's list cannot
        # silently change this instance's contents.
        self.data = list(initial_data)

    def append(self, new_item):
        """Add `new_item` to the end of `self.data`."""
        # Extend in place instead of rebuilding the whole list on every call.
        self.data.append(new_item)


my_list = MyList([1, 2, 3, 4, 5])
print(my_list.data)
my_list.append(6)
print(my_list.data)

[1, 2, 3, 4, 5]
[1, 2, 3, 4, 5, 6]


Creating and updating an attribute:

In [45]:
class MyList:
    """A minimal list wrapper that also tracks its item count in `length`."""

    def __init__(self, initial_data):
        # Copy the input so that later changes to the caller's list cannot
        # silently change this instance's contents (this also accepts any
        # iterable, not just a list).
        self.data = list(initial_data)
        # Calculate the initial length
        self.length = len(self.data)

    def append(self, new_item):
        """Add `new_item` to the end of `self.data` and update `length`."""
        # Extend in place instead of rebuilding the whole list on every call.
        self.data.append(new_item)
        # Update the length here, derived from the data so the two stay in sync
        self.length = len(self.data)


my_list = MyList([1, 1, 2, 3, 5])
print(my_list.data)
print(my_list.length)
my_list.append(8)
print(my_list.data)
print(my_list.length)

[1, 1, 2, 3, 5]
5
[1, 1, 2, 3, 5, 8]
6


## Working with Dates and Times in Python

Opening the CSV and assigning the list of lists to a variable, then removing the first row:

In [46]:
from csv import reader
# Read every row inside a context manager so the file handle is closed as
# soon as the data is in memory; newline="" is what the csv module asks for.
with open('potus_visitors_2015.csv', newline="") as opened_file:
    read_file = reader(opened_file)
    potus = list(read_file)

# Keep the header row aside and drop it from the data.
potus_headers = potus[0]
potus = potus[1:]
print(len(potus))

30740


Importing datetime module, creating an object using the function and printing examples:

In [47]:
## Import module with alias
import datetime as dt

# Build a datetime from year, month and day only
eg_1 = dt.datetime(year=2000, month=1, day=1)

## Print the object to see its value and type
print(eg_1)
print(type(eg_1))

eg_2 = dt.datetime(year=1985, month=3, day=13, hour=21, minute=26, second=2)
print(eg_2)

eg_3 = dt.datetime(year=1998, month=7, day=7, hour=8, minute=39)
print(eg_3)

ibm_founded = dt.datetime(year=1911, month=6, day=16)
man_on_moon = dt.datetime(year=1969, month=7, day=20, hour=20, minute=17)

2000-01-01 00:00:00
<class 'datetime.datetime'>
1985-03-13 21:26:02
1998-07-07 08:39:00


Using strptime constructors to convert the date string to a datetime object in a format assigned:

In [48]:
date_format = "%m/%d/%Y %H:%M" ## course csv was premade to have %y

for row in potus:
    start_date = row[1]
    # Only parse values that are still strings, so re-running this cell after
    # the column has been converted does not raise a TypeError.
    if isinstance(start_date, str):
        row[1] = dt.datetime.strptime(start_date, date_format)

for row in potus[:5]:
    print(row[1])

2015-01-06 09:30:00
2015-01-06 09:30:00
2015-01-06 09:30:00
2015-01-06 09:30:00
2015-01-06 09:30:00


Taking month and year from the second column, converting to a string, and counting frequency of visits per month/year:

In [49]:
visitors_per_month = {}

for row in potus:
    # Label each row by full month name and year, e.g. "January, 2015"
    month_label = row[1].strftime("%B, %Y")
    visitors_per_month[month_label] = visitors_per_month.get(month_label, 0) + 1

df = pd.DataFrame(
    [[month, count] for month, count in visitors_per_month.items()],
    columns=['Month/Year', 'Frequency']
)
print(df)

         Month/Year  Frequency
0     January, 2015        935
1    February, 2015       1091
2       March, 2015       1621
3       April, 2015       4180
4         May, 2015       1956
5        June, 2015       7660
6        July, 2015       2068
7      August, 2015        939
8   September, 2015       3976
9     October, 2015       3196
10   November, 2015       1118
11   December, 2015       2000


In [50]:
# Keep only the time-of-day part of the datetime stored in column 1.
appointment_times = [row[1].time() for row in potus]

print(appointment_times[:5])

[datetime.time(9, 30), datetime.time(9, 30), datetime.time(9, 30), datetime.time(9, 30), datetime.time(9, 30)]


Determining min and max values from datetime format:

In [53]:
print(type(appointment_times[0]))

# Find the earliest and latest time in the list.
min_time, max_time = min(appointment_times), max(appointment_times)
print(min_time)
print(max_time)

<class 'datetime.time'>
06:00:00
21:30:00


Calculating periods between datetimes and periods before or after a determined length:

In [54]:
## January 31st 1981, 00:00:00
dt_1 = dt.datetime(year=1981, month=1, day=31)
## June 28th 1984, 00:00:00
dt_2 = dt.datetime(year=1984, month=6, day=28)
## May 24th 2016, 00:00:00
dt_3 = dt.datetime(year=2016, month=5, day=24)
## January 1st 2001, 08:24:13
dt_4 = dt.datetime(year=2001, month=1, day=1, hour=8, minute=24, second=13)

## Subtracting one datetime from another gives a timedelta
answer_1 = dt_2 - dt_1
print(answer_1)
## 8 weeks is the same span as 56 days
answer_2 = dt_3 + dt.timedelta(weeks=8)
print(answer_2)
## 1 hour is the same span as 3600 seconds
answer_3 = dt_4 - dt.timedelta(hours=1)
print(answer_3)

1244 days, 0:00:00
2016-07-19 00:00:00
2001-01-01 07:24:13


Calculating lengths of appointments then creating a frequency table:

In [55]:
appt_lengths = {}

## Converting the end appointment time to datetime format
for row in potus:
    end_date = row[2]
    # Only parse values that are still strings, so re-running this cell after
    # the column has been converted does not raise a TypeError.
    if isinstance(end_date, str):
        row[2] = dt.datetime.strptime(end_date, "%m/%d/%Y %H:%M")

## Calculating the length of the appointment and then creating frequencies of lengths of appointments
for row in potus:
    start_date = row[1]
    end_date = row[2]
    length = end_date - start_date
    appt_lengths[length] = appt_lengths.get(length, 0) + 1

# min()/max() over a dict compare its keys, i.e. the appointment lengths.
min_length = min(appt_lengths)
max_length = max(appt_lengths)

print(min_length)
print(max_length)

2:29:00
16 days, 12:59:00


## Guided Project: Exploring Hacker News Posts

Please see separate project notebook.