## Basic practice in Python

* Variables
* Lists & slicing
* Dictionaries

In [1]:
# Basic scalar types: integers, floats, strings, booleans.

x = 1
y = 'Hello world!'  # Strings can be single quoted
y = "Hello world!"  # ... or double quoted (both spell the same value)

print(x)
x = 'Uh zoom zip'   # Variables are not statically typed: x may be rebound to a str
print(x)
y = True   # ... or False -- with uppercase "T" in True / "F" in False
print(y)

1
Uh zoom zip
True


In [2]:
# None is Python's null value (the equivalent of null / NULL in Java / C++, etc.)
x = None
x  # A cell echoes its last expression, so print(...) is optional -- but a bare None is not echoed

In [3]:
# Lists are variable-sized arrays (akin to List in Java); as shown here,
# a single list can hold items of different types:
x = [1, '2nd', 3, True]
x

[1, '2nd', 3, True]

In [4]:
# Index a list (0-based) from the front...
print(x[2]) # <-- 3rd item, i.e. 3

# Index a list from the back with a negative number (-1 is the last item):
print(x[-3]) # <-- 3rd item from the back, i.e. '2nd'

3
2nd


In [5]:
# Lists are mutable -- their items can be changed at any point.
# Tuples look like lists (but are declared with round brackets) and are immutable:
t = (1, '2nd', 3, True)
t

(1, '2nd', 3, True)

In [6]:
# Tuples are immutable, so item assignment raises TypeError.
# The error is caught and printed so that this deliberate failure
# does not stop a "Restart Kernel -> Run All" of the notebook:
try:
    t[2] = 13
except TypeError as err:
    print('TypeError:', err)

TypeError: 'tuple' object does not support item assignment

In [7]:
# Lists can be sliced with [start:finish] to get a sub-list.
# The result includes "start" but stops just before "finish";
# here -1 refers to the last item, so the first and last items are left out:
a = x[1:-1]
a

['2nd', 3]

In [8]:
# If one end of the range is omitted, the slice runs all the way to that end
# of the list -- here, from the 2nd-to-last item through the last item:
b = x[-2:]
b

[3, True]

In [9]:
# A list may contain other lists, which gives a 2D-array-like structure
# (one inner list per row):
c = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
c

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [10]:
# Some other useful list operations:

# Create an empty list with []:
empty_list = []

# Determine the length of a list with the "len()" function.
# NOTE: nothing is shown for this line, because only the LAST expression of a
# cell is echoed; wrap it in print(...) to see the value.
len(empty_list)

# Add a single item to the end of a list with "append".
# The appended list [-2, 48] becomes ONE nested item:
print('Before appending [-2, 48] to', x)
x.append([-2, 48])
print('... and after:', x)

Before appending [-2, 48] to [1, '2nd', 3, True]
... and after: [1, '2nd', 3, True, [-2, 48]]


In [11]:
# Closely related is "extend(...)", which adds each item of its argument
# one by one instead of nesting the whole argument as a single item:
x.extend(['penultimate', 'last'])
x

[1, '2nd', 3, True, [-2, 48], 'penultimate', 'last']

In [12]:
# insert(position, item) places the item in front of the given index:
x.insert(0, "Now I'm first")
x

["Now I'm first", 1, '2nd', 3, True, [-2, 48], 'penultimate', 'last']

In [13]:
# Remove the item at a given position ("del" is a statement, not a function):
del x[-1]
x

["Now I'm first", 1, '2nd', 3, True, [-2, 48], 'penultimate']

In [14]:
# Find and remove the first item equal to a given value.
# Do not use "True" here: in Python True == 1, so x.remove(True) would
# remove the earlier item 1 rather than the True entry.
x.remove(3)
x

["Now I'm first", 1, '2nd', True, [-2, 48], 'penultimate']

In [15]:
# Not sure why you'd need to reverse a list, but you can
# (reverse() changes the list itself rather than building a new one):
x.reverse()
x

['penultimate', [-2, 48], True, '2nd', 1, "Now I'm first"]

In [16]:
# If all the items are of the same (comparable) type...
y = [83, 41, 4, 804, 12]
# ... you can sort the list in place (ascending order):
print(y)
y.sort()
print(y)

[83, 41, 4, 804, 12]
[4, 12, 41, 83, 804]


In [17]:
# Concatenate two lists with "+" to form a 3rd:
z = x + y # A new list is built; x and y are unchanged
z

['penultimate', [-2, 48], True, '2nd', 1, "Now I'm first", 4, 12, 41, 83, 804]

In [18]:
# Repeat the contents of a list N times with "*":
z = 5 * ['Hello']
z

['Hello', 'Hello', 'Hello', 'Hello', 'Hello']

In [19]:
# min() / max() give the smallest / largest item (the items must be comparable):
print(f'Given {y} min is: {min(y)} & max is: {max(y)}')

Given [4, 12, 41, 83, 804] min is: 4 & max is: 804


In [20]:
# First index of a value in the list:
z.index('Hello')  # Raises ValueError if the value is not in the list

0

In [21]:
# Number of occurrences of a value in the list (0 if it is absent):
print(z.count('Hello'))
print(x.count('Hello'))

5
0


In [22]:
# Membership test: "in" is True when the value occurs in the list
print('Found x welcoming!' if 'Hello' in x else 'Moved to the next list.')

Moved to the next list.


In [23]:
# A dictionary is a hash table / associative array mapping keys to values.
eng_to_french = {}               # An empty dict is written with curly braces
eng_to_french['blue'] = 'bleu'   # Assigning to a key stores a value
eng_to_french['red'] = 'rouge'
# Look a value up by its key:
print("In French, 'red' is", eng_to_french['red'])

In French, 'red' is rouge


In [24]:
# Quick reference:
print('Number of key/value pairs:', len(eng_to_french))
print('All the keys:', eng_to_french.keys())
print('All the values:', eng_to_french.values())
print('All the key/value pairs (as tuples):', eng_to_french.items())
print('Does a key (eg. "orange") exist?', 'orange' in eng_to_french)

# NOTE: get(key, default) only RETURNS the default when the key is missing;
# it does not store 'green' in the dictionary (setdefault() would do that).
print('Set a default value (if it doesn\'t exist):', eng_to_french.get('green', 'verte'))

Number of key/value pairs: 2
All the keys: dict_keys(['blue', 'red'])
All the values: dict_values(['bleu', 'rouge'])
All the key/value pairs (as tuples): dict_items([('blue', 'bleu'), ('red', 'rouge')])
Does a key (eg. "orange") exist? False
Set a default value (if it doesn't exist): verte


In [None]:
# Some other stuff on slides.

## Numpy -- NUMerical PYthon

In [25]:
import numpy as np # Where you see np, you can assume numpy

# Numpy deals with arrays -- homogeneous types, usually numbers.
# A nested list of 2 rows x 3 items becomes a 2D array:
x = np.array([[1, 2, 3], [-99, -98, -97]])
x

array([[  1,   2,   3],
       [-99, -98, -97]])

In [26]:
# How is this different from a (nested) list holding the same values?
y = [[1, 2, 3], [-99, -98, -97]]
y

[[1, 2, 3], [-99, -98, -97]]

In [27]:
# Convert a numpy array to a (nested) list with "tolist()";
# printing both shows the two different representations:
list_x = x.tolist()
print(x)
print(list_x)

[[  1   2   3]
 [-99 -98 -97]]
[[1, 2, 3], [-99, -98, -97]]


In [28]:
# "+" means different things for the two types:
# on a numpy array it adds element by element...
print(x + x)
# ... while on a list it concatenates:
print(list_x + list_x)

[[   2    4    6]
 [-198 -196 -194]]
[[1, 2, 3], [-99, -98, -97], [1, 2, 3], [-99, -98, -97]]


In [29]:
# The shape is a tuple with the array's size along each dimension:
x.shape # IMPORTANT. (2 rows, 3 columns)

(2, 3)

In [30]:
# Every element of an array shares one data type, exposed as "dtype":
x.dtype

dtype('int64')

In [31]:
# Create an array of 4 zeros (floating point, as the output shows):
z = np.zeros(4)
z

array([0., 0., 0., 0.])

In [32]:
# arange(10) gives the integers 0..9 (the end value is excluded):
z = np.arange(10)
z

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [33]:
# Slice an array like a list (start included, finish excluded):
z[3:8]

array([3, 4, 5, 6, 7])

In [34]:
# Aggregations on arrays.
# NOTE: numpy's std() defaults to the population standard deviation (ddof=0).
print('Mean:', z.mean())
print('Sum:', z.sum())
print('Std. dev.', z.std())

Mean: 4.5
Sum: 45
Std. dev. 2.8722813232690143


## Pandas

In [2]:
import pandas as pd # Any time you see "pd", you can assume pandas
import numpy as np
# Build a Series from an array; with no index given, the labels are 0, 1, 2, ...
series = pd.Series(np.arange(5))
series

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [36]:
# In the output, the index label is on the left and the value on the right.
# Labels do not have to be integers -- pass them with "index":
series1 = pd.Series([10, 9, 8], index = ['colour', 'size', 'wgt'])
series2 = pd.Series([900, 19, 31], index = ['size', 'price', 'r'])
series1

colour    10
size       9
wgt        8
dtype: int64

In [37]:
# Arithmetic aligns on index labels: only 'size' exists in both series, so
# every other label gets NaN (and the result becomes float64).
x = series1 + series2
x

colour      NaN
price       NaN
r           NaN
size      909.0
wgt         NaN
dtype: float64

In [38]:
# DataFrame is the most used Pandas structure;
# it can be built in a few different ways.
# From a list of rows:
df1 = pd.DataFrame([[111, 222], ['a', 'b']])
# From a numpy array. NOTE: numpy arrays are homogeneous, so here the
# mixed int / str values are all stored as strings:
df2 = pd.DataFrame(np.array([[111, 222], ['a', 'b']]))
# From a list of Series (one Series per row):
df3 = pd.DataFrame([pd.Series([111, 222]),pd.Series(['a', 'b'])])
df1

Unnamed: 0,0,1
0,111,222
1,a,b


In [5]:
# Very common to read a DataFrame from a CSV file
# (the relative path means 'titanic.csv' must be in the working directory):
df = pd.read_csv('titanic.csv')
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


In [6]:
# Select a single column by name (attribute style):
df.Survived

0      0
1      1
2      1
3      1
4      0
      ..
882    0
883    1
884    0
885    1
886    0
Name: Survived, Length: 887, dtype: int64

In [7]:
# Alternatively, use bracket style -- required for column names that are not
# valid Python identifiers, e.g. 'Siblings/Spouses Aboard':
df['Survived']

0      0
1      1
2      1
3      1
4      0
      ..
882    0
883    1
884    0
885    1
886    0
Name: Survived, Length: 887, dtype: int64

In [8]:
# iloc selects by integer position: [row slice, column slice].
# As with lists, the end of each slice is excluded (rows 2-6, columns 1-3):
df.iloc[2:7,1:4]

Unnamed: 0,Pclass,Name,Sex
2,3,Miss. Laina Heikkinen,female
3,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female
4,3,Mr. William Henry Allen,male
5,3,Mr. James Moran,male
6,1,Mr. Timothy J McCarthy,male


In [9]:
# loc is also common... it selects by label (here: all rows, column 'Name'):
df.loc[:, 'Name'].head(3) # First 3 names

0                               Mr. Owen Harris Braund
1    Mrs. John Bradley (Florence Briggs Thayer) Cum...
2                                Miss. Laina Heikkinen
Name: Name, dtype: object

In [44]:
# Add a new column (by performing some math on existing column data).
# NOTE: this modifies df itself rather than creating a new DataFrame.
df['age_squared'] = df.Age**2
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,age_squared
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500,484.0
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833,1444.0
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250,676.0
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000,1225.0
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500,1225.0
...,...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000,729.0
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000,361.0
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500,49.0
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000,676.0


In [45]:
# Don't want it any more? Remove it with drop(), which returns a NEW
# DataFrame; rebinding the name is preferred over inplace=True.
# errors='ignore' keeps this cell safe to re-run once the column is gone.
df = df.drop(columns=['age_squared'], errors='ignore')
# A few other ways to do this, including:
# del df['age_squared']
# df = df.drop(['age_squared'], axis=1)
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


In [46]:
# Quickly compute a statistic over one column (here: the mean of Age):
df.Age.mean()

29.471443066516347

In [47]:
# Survived? Count how often each distinct value of the column occurs:
df.Survived.value_counts()

0    545
1    342
Name: Survived, dtype: int64

In [48]:
# How many cells are null? isnull() flags each missing cell,
# and sum() then counts the flags per column:
df.isnull().sum()

Survived                   0
Pclass                     0
Name                       0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64

In [49]:
# By fare class (Pclass), determine the mean of the other numeric columns.
# numeric_only=True is needed on pandas >= 2.0, where text columns such as
# Name and Sex would otherwise make mean() raise a TypeError:
df.groupby('Pclass').mean(numeric_only=True)

Unnamed: 0_level_0,Survived,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.62963,38.788981,0.416667,0.356481,84.154687
2,0.472826,29.868641,0.402174,0.380435,20.662183
3,0.244353,25.188747,0.620123,0.396304,13.707707
