### **Data Manipulation in Python**

### Reading data from CSV files using csv package

In [None]:
# Set-up
import csv

In [None]:
# Upload initial data files
# Choose files Cities.csv and Countries.csv - must be on local computer
# If running notebook on local computer:
#   This cell does nothing (google.colab is not available there)
#   Make sure data files are in same workspace as notebook
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: skip the upload instead of raising, so that
    # "Run All" still works; the CSV files are read from the working directory
    uploaded = {}
else:
    uploaded = files.upload()

Saving Cities.csv to Cities.csv
Saving Countries.csv to Countries.csv


In [None]:
# Use csv package 'DictReader' to read Cities.csv data
# After header, data is read row-by-row into dictionary format
# Note all values are read as strings
# newline='' leaves line-ending handling to the csv module, as the csv
# documentation recommends for any file object passed to a reader
with open('Cities.csv', newline='') as f:
    rows = csv.DictReader(f)
    for r in rows:
        print(dict(r))

In [None]:
# Print the city and longitude of all cities with longitude < 0
# Values are read as strings, so longitude is converted with float() for the
# numeric comparison; the original string is what gets printed
# newline='' leaves line-ending handling to the csv module (see csv docs)
with open('Cities.csv', newline='') as f:
    rows = csv.DictReader(f)
    for r in rows:
        if float(r['longitude']) < 0:
            print(r['city'], r['longitude'])
# Show what happens without float()

Aberdeen -2.08
Albacete -1.87
Algeciras -5.47
Angers -0.53
Badajoz -6.97
Belfast -5.96
Bilbao -2.93
Birmingham -1.92
Blackpool -3.05
Bordeaux -0.60
Bournemouth -1.90
Bradford -1.75
Braga -8.42
Brest -4.50
Burgos -3.68
Caen -0.35
Cartagena -0.98
Cork -8.50
Dublin -6.25
Dundee -3.00
Edinburgh -3.22
Exeter -3.53
Galway -9.05
Glasgow -4.25
Granada -3.59
Huelva -6.93
Inverness -4.23
Lisbon -9.14
Madrid -3.68
Marbella -4.88
Murcia -1.13
Oviedo -5.83
Salamanca -5.67
Santander -3.80
Swansea -3.95
Valencia -0.40
Vigo -8.73
Zaragoza -0.89


### <font color="green">**Your Turn**</font>

In [None]:
# Using csv package 'DictReader' to read Countries.csv data,
# find all countries that have coastline and are not in the EU;
# print the countries and their populations
# Note: In Python, use '==' to test equality
# newline='' leaves line-ending handling to the csv module (see csv docs)
with open('Countries.csv', newline='') as f:
    rows = csv.DictReader(f)
    for r in rows:
        # 'coastline' and 'EU' hold the strings 'yes' / 'no'
        if r['coastline'] == 'yes' and r['EU'] == 'no':
            print(r['country'], r['population'])


Albania 2.9
Bosnia and Herzegovina 3.8
Iceland 0.33
Montenegro 0.63
Norway 5.27
Turkey 79.62
Ukraine 44.62


### Reading data into Python data structures

In [None]:
# Read Cities.csv data into list of dictionaries
# One dictionary per data row, keyed by the header names; all values are strings
# newline='' leaves line-ending handling to the csv module (see csv docs)
with open('Cities.csv', newline='') as f:
    cities = [dict(r) for r in csv.DictReader(f)]
cities

In [None]:
# Read Countries.csv data into list of dictionaries
# One dictionary per data row, keyed by the header names; all values are strings
# newline='' leaves line-ending handling to the csv module (see csv docs)
with open('Countries.csv', newline='') as f:
    countries = [dict(r) for r in csv.DictReader(f)]
countries

[{'country': 'Albania', 'population': '2.9', 'EU': 'no', 'coastline': 'yes'},
 {'country': 'Andorra', 'population': '0.07', 'EU': 'no', 'coastline': 'no'},
 {'country': 'Austria', 'population': '8.57', 'EU': 'yes', 'coastline': 'no'},
 {'country': 'Belarus', 'population': '9.48', 'EU': 'no', 'coastline': 'no'},
 {'country': 'Belgium',
  'population': '11.37',
  'EU': 'yes',
  'coastline': 'yes'},
 {'country': 'Bosnia and Herzegovina',
  'population': '3.8',
  'EU': 'no',
  'coastline': 'yes'},
 {'country': 'Bulgaria', 'population': '7.1', 'EU': 'yes', 'coastline': 'yes'},
 {'country': 'Croatia', 'population': '4.23', 'EU': 'yes', 'coastline': 'yes'},
 {'country': 'Cyprus', 'population': '1.18', 'EU': 'yes', 'coastline': 'yes'},
 {'country': 'Czech Republic',
  'population': '10.55',
  'EU': 'yes',
  'coastline': 'no'},
 {'country': 'Denmark', 'population': '5.69', 'EU': 'yes', 'coastline': 'yes'},
 {'country': 'Estonia', 'population': '1.31', 'EU': 'yes', 'coastline': 'yes'},
 {'countr

In [None]:
# Print the city and longitude of all cities with longitude < 0
# The longitude is stored as a string: convert it for the comparison only,
# and print it exactly as it was read
negative_longitude = (c for c in cities if float(c['longitude']) < 0)
for city in negative_longitude:
    print(city['city'], city['longitude'])

Aberdeen -2.08
Albacete -1.87
Algeciras -5.47
Angers -0.53
Badajoz -6.97
Belfast -5.96
Bilbao -2.93
Birmingham -1.92
Blackpool -3.05
Bordeaux -0.60
Bournemouth -1.90
Bradford -1.75
Braga -8.42
Brest -4.50
Burgos -3.68
Caen -0.35
Cartagena -0.98
Cork -8.50
Dublin -6.25
Dundee -3.00
Edinburgh -3.22
Exeter -3.53
Galway -9.05
Glasgow -4.25
Granada -3.59
Huelva -6.93
Inverness -4.23
Lisbon -9.14
Madrid -3.68
Marbella -4.88
Murcia -1.13
Oviedo -5.83
Salamanca -5.67
Santander -3.80
Swansea -3.95
Valencia -0.40
Vigo -8.73
Zaragoza -0.89


In [None]:
# Print all cities that are not in the EU
# Requires joining cities and countries: for every city, scan the countries
# list for the row with the same country name and check its 'EU' value
for city in cities:
    for country in countries:
        if city['country'] != country['country']:
            continue # not this city's country
        if country['EU'] == 'no':
            print(city['city'], '-', city['country'])

Adana - Turkey
Andorra - Andorra
Ankara - Turkey
Antalya - Turkey
Balti - Moldova
Basel - Switzerland
Batman - Turkey
Belgrade - Serbia
Bergen - Norway
Bila Tserkva - Ukraine
Bodo - Norway
Brest - Belarus
Bursa - Turkey
Cherkasy - Ukraine
Chernihiv - Ukraine
Chernivtsi - Ukraine
Chisinau - Moldova
Denizli - Turkey
Edirne - Turkey
Elbasan - Albania
Erzincan - Turkey
Erzurum - Turkey
Eskisehir - Turkey
Gaziantep - Turkey
Geneva - Switzerland
Horlivka - Ukraine
Hrodna - Belarus
Istanbul - Turkey
Karaman - Turkey
Kayseri - Turkey
Kherson - Ukraine
Kiev - Ukraine
Kremenchuk - Ukraine
Kryvyy Rih - Ukraine
Lvov - Ukraine
Makiyivka - Ukraine
Malatya - Turkey
Manisa - Turkey
Mazyr - Belarus
Minsk - Belarus
Nis - Serbia
Novi Sad - Serbia
Ordu - Turkey
Orsha - Belarus
Oslo - Norway
Pinsk - Belarus
Podgorica - Montenegro
Rivne - Ukraine
Samsun - Turkey
Sarajevo - Bosnia and Herzegovina
Siirt - Turkey
Sivas - Turkey
Skopje - Macedonia
Stavanger - Norway
Sumy - Ukraine
Tarsus - Turkey
Tekirdag - Tur

### Aggregation

In [None]:
import numpy as np

In [None]:
# Compute overall average city temperature
# Collect every temperature (converted from string to float) in one list
temps = [float(c['temperature']) for c in cities]
# print(temps)
print(np.average(temps))

9.497840375586854


In [None]:
# Alternative using running sum and count
# The running sum is named 'total' rather than 'sum' so that Python's
# built-in sum() function is not overwritten for the rest of the notebook
total = 0
count = 0
for city in cities:
    total += float(city['temperature'])
    count += 1
print(total/count)

9.497840375586858


In [None]:
# Compute average city temperature for each country
# First compute list of countries: each country once, in the order in which
# it first appears in the cities list
countryList = list(dict.fromkeys(city['country'] for city in cities))
# print(countryList)
# Then compute average temperature for each
for country in countryList:
    temps = [float(city['temperature'])
             for city in cities
             if city['country'] == country]
    print(country, np.average(temps))

Denmark 7.625
United Kingdom 8.649999999999999
Sweden 3.5866666666666673
Turkey 11.726666666666667
Spain 14.238333333333332
France 10.151111111111112
Netherlands 8.756666666666668
Italy 13.474666666666668
Andorra 9.6
Romania 9.224444444444444
Greece 16.9025
Germany 7.8692857142857155
Moldova 8.415
Switzerland 7.253333333333333
Serbia 9.85
Norway 3.7260000000000004
Poland 7.25
Ukraine 7.420000000000001
Portugal 14.469999999999999
Slovakia 8.48
Belarus 5.946666666666666
Czech Republic 7.8566666666666665
Belgium 9.65
Hungary 9.6025
Bulgaria 10.44
Ireland 9.299999999999999
Latvia 5.27
Albania 15.18
Austria 6.144
Finland 3.4875
Lithuania 6.1433333333333335
Slovenia 9.27
Montenegro 9.99
Croatia 10.865
Bosnia and Herzegovina 9.6
Macedonia 9.36
Estonia 4.59


In [None]:
# Compute overall minimum and maximum city temperatures
# Convert every temperature to float once, then let min()/max() scan the list
temps = [float(c['temperature']) for c in cities]
lowest, highest = min(temps), max(temps)
print('Minimum:', lowest)
print('Maximum:', highest)

Minimum: -2.2
Maximum: 18.67


In [None]:
# Alternative method using running min and max
# Start from +/- infinity so that the first temperature always replaces the
# starting values whatever its sign (starting the maximum at 0 would give a
# wrong answer if every temperature were negative)
minval = float('inf')  # greater than any possible minimum
maxval = float('-inf') # less than any possible maximum
for city in cities:
    temp = float(city['temperature']) # convert the string once per city
    if temp < minval:
        minval = temp
    if temp > maxval:
        maxval = temp
print('Minimum:', minval)
print('Maximum:', maxval)

### <font color="green">**Your Turn**</font>

In [None]:
# Find the minimum, maximum, and average temperatures of
# cities that are in the EU, and the minimum, maximum, and average
# temperatures of cities that are not in the EU
#
# Hint: You will need to "join" cities and countries using one loop inside
#   another as seen in an earlier example
# Then create two lists of temperatures:
EU = [] # temperatures of EU cities
nonEU = [] # temperatures of non-EU cities
#
# Populate the lists: match each city to its country row, then file the city's
# temperature under EU or non-EU according to that country's 'EU' value.
# The 'cities' list read earlier is used as-is; resetting it here would leave
# both lists empty and make min()/max() fail.
for city in cities:
    for country in countries:
        if city['country'] == country['country']:
            if country['EU'] == 'yes':
                EU.append(float(city['temperature']))
            else:
                nonEU.append(float(city['temperature']))

#
# Once the lists are populated, the following code prints the results
print('EU:    ', 'minimum', min(EU), 'maximum', max(EU), 'average', np.average(EU))
print('non-EU:', 'minimum', min(nonEU), 'maximum', max(nonEU), 'average', np.average(nonEU))

### <font color="green">**Your Turn: World Cup Data**</font>

In [None]:
# Upload world cup data files
# Choose files Players.csv and Teams.csv - must be on local computer
# If running notebook on local computer, this cell does nothing: make sure
# the data files are in the same workspace as the notebook
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: skip the upload instead of raising, so that
    # "Run All" still works; the CSV files are read from the working directory
    uploaded = {}
else:
    uploaded = files.upload()

Saving Players.csv to Players.csv
Saving Teams.csv to Teams.csv


In [None]:
# Read Players.csv and Teams.csv into lists of dictionaries
# One dictionary per data row, keyed by the header names; all values are strings
# newline='' leaves line-ending handling to the csv module (see csv docs)
with open('Players.csv', newline='') as f:
    players = [dict(r) for r in csv.DictReader(f)]
with open('Teams.csv', newline='') as f:
    teams = [dict(r) for r in csv.DictReader(f)]

In [None]:
# Show first 5 items in players list
# Each item is one row of Players.csv as a dictionary; the values are strings,
# so numeric fields need int()/float() before being compared to numbers
players[:5]

[{'surname': 'Abdoun',
  'team': 'Algeria',
  'position': 'midfielder',
  'minutes': '16',
  'shots': '0',
  'passes': '6',
  'tackles': '0',
  'saves': '0'},
 {'surname': 'Belhadj',
  'team': 'Algeria',
  'position': 'defender',
  'minutes': '270',
  'shots': '1',
  'passes': '146',
  'tackles': '8',
  'saves': '0'},
 {'surname': 'Boudebouz',
  'team': 'Algeria',
  'position': 'midfielder',
  'minutes': '74',
  'shots': '3',
  'passes': '28',
  'tackles': '1',
  'saves': '0'},
 {'surname': 'Bougherra',
  'team': 'Algeria',
  'position': 'defender',
  'minutes': '270',
  'shots': '1',
  'passes': '89',
  'tackles': '11',
  'saves': '0'},
 {'surname': 'Chaouchi',
  'team': 'Algeria',
  'position': 'goalkeeper',
  'minutes': '90',
  'shots': '0',
  'passes': '17',
  'tackles': '0',
  'saves': '2'}]

In [None]:
# Show first 5 items in teams list
# Each item is one row of Teams.csv as a dictionary; the values are strings,
# so numeric fields need int()/float() before being compared to numbers
teams[:5]

[{'team': 'Brazil',
  'ranking': '1',
  'games': '5',
  'wins': '3',
  'draws': '1',
  'losses': '1',
  'goalsFor': '9',
  'goalsAgainst': '4',
  'yellowCards': '7',
  'redCards': '2'},
 {'team': 'Spain',
  'ranking': '2',
  'games': '6',
  'wins': '5',
  'draws': '0',
  'losses': '1',
  'goalsFor': '7',
  'goalsAgainst': '2',
  'yellowCards': '3',
  'redCards': '0'},
 {'team': 'Portugal',
  'ranking': '3',
  'games': '4',
  'wins': '1',
  'draws': '2',
  'losses': '1',
  'goalsFor': '7',
  'goalsAgainst': '1',
  'yellowCards': '8',
  'redCards': '1'},
 {'team': 'Netherlands',
  'ranking': '4',
  'games': '6',
  'wins': '6',
  'draws': '0',
  'losses': '0',
  'goalsFor': '12',
  'goalsAgainst': '5',
  'yellowCards': '15',
  'redCards': '0'},
 {'team': 'Italy',
  'ranking': '5',
  'games': '3',
  'wins': '0',
  'draws': '2',
  'losses': '1',
  'goalsFor': '4',
  'goalsAgainst': '5',
  'yellowCards': '5',
  'redCards': '0'}]

In [None]:
# What player on a team with "ia" in the team name played less than
# 200 minutes and made more than 100 passes? Print the player surname.
# Note: In Python, use "'abc' in s" to check whether string s contains 'abc'
# Reminder: Convert minutes and passes to integers before comparing to values
# Rule players out one criterion at a time; whoever is left gets printed
for player in players:
    if 'ia' not in player['team']:
        continue
    if int(player['minutes']) >= 200:
        continue
    if int(player['passes']) > 100:
        print(player['surname'])
        

Kuzmanovic


In [None]:
# What is the average number of passes made by forwards? By midfielders?
# Make sure to label which is which.
# Tally [number of players, total passes] for the two positions of interest
tallies = {'forward': [0, 0], 'midfielder': [0, 0]}
for player in players:
    tally = tallies.get(player['position'])
    if tally is None:
        continue # any other position is ignored
    tally[0] += 1
    tally[1] += int(player['passes'])

forwards_count, forwards_passes = tallies['forward']
midfielders_count, midfielders_passes = tallies['midfielder']

# A position with no players reports an average of 0 instead of dividing by zero
forwards_average = forwards_passes / forwards_count if forwards_count else 0
midfielders_average = midfielders_passes / midfielders_count if midfielders_count else 0

print("Average number of passes made by forwards:", forwards_average)
print("Average number of passes made by midfielders:", midfielders_average)



Average number of passes made by forwards: 50.82517482517483
Average number of passes made by midfielders: 95.2719298245614


In [None]:
# Which team has the highest ratio of goalsFor to goalsAgainst?
# Print the team only.
# Reminder: Use float() to make sure you're doing floating point division
# Hint: Use two variables to keep track of highest ratio seen so far
# and team with that ratio:
ratio = 0 # highest ratio seen so far
ratioteam = '' # team with highest ratio
for team in teams:
    goals_for = float(team['goalsFor'])
    goals_against = float(team['goalsAgainst'])
    if goals_against == 0:
        # Dividing would raise ZeroDivisionError. A team that scored without
        # conceding has an unbounded ratio; 0 for / 0 against is undefined,
        # so that team is skipped.
        if goals_for == 0:
            continue
        current_ratio = float('inf')
    else:
        current_ratio = goals_for / goals_against

    # Strict '>' keeps the first team seen when several share the best ratio
    if current_ratio > ratio:
        ratio = current_ratio
        ratioteam = team['team']


print("Team with highest ratio :", ratioteam)


Team with highest ratio : Portugal


In [None]:
# How many players who play on a team with ranking <10 played
# more than 350 minutes?
# Reminder: Convert ranking and minutes to integers before comparing to values
# Hint: Compute join of players and teams, using a variable to count number of
# players satisfying requirement
count = 0 # players satisfying the requirement so far
for player in players:
    for team in teams:
        if player['team'] != team['team']:
            continue # not this player's team
        if int(team['ranking']) >= 10:
            continue # team is not ranked better than 10
        if int(player['minutes']) > 350:
            count += 1

# Print count
print("Number of players:", count)



Number of players: 54


### <font color="green">**Your Turn Extra: Titanic Data**</font>

In [None]:
# Upload Titanic data file
# Choose file Titanic.csv - must be on local computer
# If running notebook on local computer, this cell does nothing: make sure
# the data file is in the same workspace as the notebook
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: skip the upload instead of raising, so that
    # "Run All" still works; the CSV file is read from the working directory
    uploaded = {}
else:
    uploaded = files.upload()

In [None]:
# Read Titanic.csv into list of dictionaries
# One dictionary per data row, keyed by the header names; all values are strings
# newline='' leaves line-ending handling to the csv module (see csv docs)
with open('Titanic.csv', newline='') as f:
    titanic = [dict(r) for r in csv.DictReader(f)]

In [None]:
# Show first 5 items in titanic list
# Each item is one row of Titanic.csv as a dictionary with string values
titanic[:5]

In [None]:
# How many married women over the specified age threshold embarked in Cherbourg?
# Make sure to try different thresholds.
# Note: In Python, use "'abc' in s" to check whether string s contains 'abc'
# Note: You will need to account for the fact that some ages are the empty string ''
# Reminder: Convert non-blank ages to float before comparing to a value
age_threshold = 50
# YOUR CODE HERE

In [None]:
# What is the average fare paid by passengers in the three classes, and the
# average age of passengers in the three classes (ignoring missing ages)?
# YOUR CODE HERE

In [None]:
# Find the survival rate for passengers in the three different classes,
# i.e., what fraction of passengers in each class survived? Also find
# the survival rate for males versus females, and for children (age < 18)
# versus adults (age >= 18) ignoring passengers whose age is missing.
# YOUR CODE HERE

In [None]:
# Find pairs of passengers who are likely to be twin children: same
# last name, same age, same embarkation, and age is under 18. Print
# each pair once, including their names, age, and embarkation city.
# YOUR CODE HERE