In [1]:
import numpy as np
import csv

# Dataset
http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_MLB_HeightsWeights


Human Height and Weight are mostly hereditable, but lifestyles (e.g., regular strenuous physical exercise), diet, health and environmental factors also play a role in determining individual's physical characteristics. The dataset below contains 1035 records of heights and weights for some current and recent Major League Baseball (MLB) Players. These data were obtained from different resources (e.g., IBM Many Eyes and the references below). See also the 25,000 records of adolescent height and weight.

The data includes the following variables:

Name: MLB Player Name
Team: The Baseball team the player was a member of at the time the data was acquired
Position: Player field position
Height(inches): Player height in inches
Weight(pounds): Player weight in pounds
Age: Player age at time of record

In [7]:
# Open file and create lists
csv_file = open('baseball.csv', 'r')
height_in = []
weight_p = []
age = []


In [8]:
# Read off and discard first line, to skip headers
csv_file.readline()

# Split columns while reading
for name, team, position, height, weight, age_v, poscategory in csv.reader(csv_file, delimiter=','):
    # Append each variable to a separate list
    height_in.append(float(height))
    weight_p.append(float(weight))
    age.append(float(age_v))

In [4]:
# Create a numpy array from height, weight and age
np_height = np.array(height_in)
np_weight = np.array(weight_p)
np_age = np.array(age)

print(np_height)
print(np_weight) 
print(np_age)


[74. 74. 72. ... 75. 75. 73.]
[180. 215. 210. ... 205. 190. 195.]
[22.99 34.69 30.78 ... 25.19 31.01 27.92]


The height is expressed in inches. Lets use NumPy to convert the units to meters

In [6]:
 # Convert np_height to meter: np_height_m
np_height_m = np_height * 0.0254

# Print np_height_m
print(np_height_m)

[1.8796 1.8796 1.8288 ... 1.905  1.905  1.8542]


In [9]:
# Convert np_weight to kg: np_weight_kg
np_weight_kg = np_weight * 0.453592

# Print np_weight_kg
print(np_weight_kg)

[81.64656 97.52228 95.25432 ... 92.98636 86.18248 88.45044]


In [10]:
# Calculate BMI
bmi = np_weight_kg / np_height_m**2

# Print bmi
print(bmi)

[23.11037639 27.60406069 28.48080465 ... 25.62295933 23.74810865
 25.72686361]


## Subset Lightweight baseball players

In [12]:
# Create the light array
light = bmi < 21

# Print light
print(light)

# Print BMIs of all baseball players below 21
print(bmi[light])

[False False False ... False False False]
[20.54255679 20.54255679 20.69282047 20.69282047 20.34343189 20.34343189
 20.69282047 20.15883472 19.4984471  20.69282047 20.9205219 ]


In [17]:
# lets create a list of lists (Weight and Height)
# We can use the numPy Colum Stack 
np_baseball = np.column_stack((np_height, np_weight, np_age))

# Print np_baseball type
print(type(np_baseball))

# Print the shape
print(np_baseball.shape)

# Print np_baseball
print(np_baseball)

<class 'numpy.ndarray'>
(1015, 3)
[[74.      81.64656 22.99   ]
 [74.      97.52228 34.69   ]
 [72.      95.25432 30.78   ]
 ...
 [75.      92.98636 25.19   ]
 [75.      86.18248 31.01   ]
 [73.      88.45044 27.92   ]]


In [18]:
# Tranning Np Slice
# Print out the 50th row of np_baseball
print(np_baseball[49,:])

# Select the entire second column of np_baseball: np_weight
np_weight = np_baseball[:,1]

# Print out height of 124th player
print(np_baseball[123, 0])

[70.      88.45044 30.69   ]
75.0


In [21]:
# Lets calculate BMi wisely
conversion = np.array([0.0254, 0.453592, 1])

# Print product of np_baseball and conversion
np_baseball = np_baseball * conversion
print(np_baseball)

[[ 1.8796     37.03422644 22.99      ]
 [ 1.8796     44.23532603 34.69      ]
 [ 1.8288     43.20659752 30.78      ]
 ...
 [ 1.905      42.17786901 25.19      ]
 [ 1.905      39.09168347 31.01      ]
 [ 1.8542     40.12041198 27.92      ]]


In [25]:
# lets analyze the data
# Print the mean of np_height
avg = np.mean(np_baseball[:,0])
print("Average: " + str(avg))

# Print median of np_height
med = np.mean(np_baseball[:,0])
print("Median: " + str(med))

# Print the standard deviation on height
stddev = np.std(np_baseball[:,0])
print("Standard Deviation: " + str(stddev))

# Print correlation between firt and second column
corr = np.corrcoef(np_baseball[:,0], np_baseball[:,1])
print("Correlation: " + str(corr))

Average: 1.8717172413793102
Median: 1.8717172413793102
Standard Deviation: 0.05874491377858227
Correlation: [[1.         0.53153932]
 [0.53153932 1.        ]]


# Lets see some FIFA data

The median height of goalkeepers is higher than that of other players on the soccer field?

In [68]:
# Open file and create lists
csv_file = open('fifa.csv', 'r', encoding='utf-8')
positions = []
fifa_heights = []

In [69]:
# Read off and discard first line, to skip headers
csv_file.readline()

# Split columns while reading
for id, name, rating, position, height, foot, rare, pace, shooting, passing, dribbling, defending, heading, diving, handling, kicking, reflexes, speed, positioning in csv.reader(csv_file, skipinitialspace=True, delimiter=','):
    # Append each variable to a separate list
    positions.append(position)
    fifa_heights.append(float(height))



In [72]:
# Convert positions and heights to numpy arrays: np_positions, np_heights
np_positions = np.array(positions)
np_fifa_heights = np.array(fifa_heights)


# Heights of the goalkeepers: gk_heights
#np_positions == 'GK'
gk_heights = np_fifa_heights[np_positions == 'GK']


# Heights of the other players: other_heights
other_heights = np_fifa_heights[np_positions != 'GK']

# Print out the median height of goalkeepers. Replace 'None'
print("Median height of goalkeepers: " + str(np.median(gk_heights)))

# Print out the median height of other players. Replace 'None'
print("Median height of other players: " + str(np.median(other_heights)))

Median height of goalkeepers: 188.0
Median height of other players: 181.0


Yes! The Goalkeepers are taller than others players