# <span style="color:darkblue"> Lecture 11: Application 2 - Random Assignment </span>

<font size = "5">



# <span style="color:darkblue"> I. Import Libraries and Data </span>


In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [13]:
# Read the car features dataset from the "data_raw" subfolder
carfeatures = pd.read_csv("data_raw/features.csv")

In [14]:
# Display the dataset (a name on the last line of a cell is shown as output)
carfeatures

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,vehicle_id
0,18.0,8,307,130,3504,12.0,C-1689780
1,15.0,8,350,165,3693,11.5,B-1689791
2,18.0,8,318,150,3436,11.0,P-1689802
3,16.0,8,304,150,3433,12.0,A-1689813
4,17.0,8,302,140,3449,10.5,F-1689824
...,...,...,...,...,...,...,...
393,27.0,4,140,86,2790,15.6,F-1694103
394,44.0,4,97,52,2130,24.6,V-1694114
395,32.0,4,135,84,2295,11.6,D-1694125
396,28.0,4,120,79,2625,18.6,F-1694136


# <span style="color:darkblue"> II. Random Assignment </span>

<font size = "5">

Random assignment is crucial for scientific progress ...

- The basis for medical trials
- Also used in engineering, the natural sciences and <br>
  social sciences (economics, political science, etc.)


In [15]:
# "list_status" is a list with "treatment/control" arms
# "prop_status" is the proportion in the treatment and control arms
# "size_dataset" is how many rows are contained

list_status  = ["Treatment", "Control"]
prop_status  = [0.4, 0.6]
size_dataset = carfeatures.shape[0]

In [16]:
# Check the number of rows in the dataset (one status is drawn per row)
size_dataset

398

<font size = "5">
Random assignment


In [17]:
# The "np.random.choice" will create a vector with the status
# We will save this to a column in "carfeatures"
# Note: (i) We can always split the arguments of a function in multiple lines
#           to make it easier to read

# Fix the seed so that the same random draw is obtained on every run
np.random.seed(42)

# Draw one status per row using the probabilities in "prop_status"
# and store the result as a new column called "status"
carfeatures["status"] = np.random.choice(
    list_status,
    size = size_dataset,
    p = prop_status,
)

display(carfeatures)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,vehicle_id,status
0,18.0,8,307,130,3504,12.0,C-1689780,Treatment
1,15.0,8,350,165,3693,11.5,B-1689791,Control
2,18.0,8,318,150,3436,11.0,P-1689802,Control
3,16.0,8,304,150,3433,12.0,A-1689813,Control
4,17.0,8,302,140,3449,10.5,F-1689824,Treatment
...,...,...,...,...,...,...,...,...
393,27.0,4,140,86,2790,15.6,F-1694103,Control
394,44.0,4,97,52,2130,24.6,V-1694114,Treatment
395,32.0,4,135,84,2295,11.6,D-1694125,Control
396,28.0,4,120,79,2625,18.6,F-1694136,Control


<font size = "5">

Compute frequencies by status

In [18]:
# The command "pd.crosstab" counts how many rows fall in each "status"
# If we add the option "normalize = True" it computes proportions instead
# Note: "np.random.choice" draws the status of each row independently
#       (by default it samples WITH replacement), which means that the
#       realized proportions are approximately the same
#       (but not equal) to "prop_status"

frequency_table = pd.crosstab(
    index = carfeatures["status"],
    columns = "Frequency",
)
proportions_table = pd.crosstab(
    index = carfeatures["status"],
    columns = "Frequency",
    normalize = True,
)

# "display" accepts several objects and renders them one after the other
display(frequency_table, proportions_table)


col_0,Frequency
status,Unnamed: 1_level_1
Control,230
Treatment,168


col_0,Frequency
status,Unnamed: 1_level_1
Control,0.577889
Treatment,0.422111


<font size = "5">

Query with string conditions

In [19]:
# When you have queries for text variables, it's important
# to use outer ' ' single quotations
# and inner double quotations.
# Each query keeps only the rows whose "status" equals the given text,
# which splits the dataset into a treated subset and a control subset.

data_treated = carfeatures.query('status == "Treatment" ')
data_control = carfeatures.query('status == "Control" ')

<font size = "5">

Treated/control should be similar

- This is the key principle of random assignment
- We can check the summary statistics

In [20]:
# The count is different because we assigned different proportions
# All other summary statistics are approximately the same
# They are not identical because the assignment is random

display(data_treated.describe())
display(data_control.describe())

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration
count,168.0,168.0,168.0,168.0,168.0
mean,23.865476,5.255952,181.363095,2867.833333,15.885119
std,7.507728,1.681164,99.50918,799.502747,2.859727
min,10.0,3.0,68.0,1649.0,9.0
25%,18.0,4.0,98.0,2211.75,14.0
50%,23.8,4.0,140.0,2697.0,15.8
75%,29.0,6.0,250.0,3421.25,17.5
max,44.6,8.0,455.0,5140.0,24.8


Unnamed: 0,mpg,cylinders,displacement,weight,acceleration
count,230.0,230.0,230.0,230.0,230.0
mean,23.258261,5.6,202.23913,3045.36087,15.336522
std,8.040199,1.704348,106.967118,873.926168,2.663167
min,9.0,4.0,71.0,1613.0,8.0
25%,16.5,4.0,107.0,2257.25,13.625
50%,22.0,6.0,156.0,2900.5,15.4
75%,29.375,8.0,302.0,3728.75,16.975
max,46.6,8.0,455.0,4997.0,23.7


## <span style="color:darkblue"> III. Quiz Structure </span>

<font size = "5">

The day of the quiz I will ...
- Provide a dataset with information
- Give more specific instructions.
- Below, you will see the type of questions that will be asked.
- The idea is for you to apply known concepts to new data
- You have 50 minutes to complete the assignment

Questions

(exact wording may change in quiz, but exercise will be very similar)


<font size = "5">

(a) Create a function and apply it to a column

- Check Lecture 8 for how to define a function
- The function will have if/else statements and output a string
- You will use ".apply()" to create a new variable in the dataset <br>
(see Lecture 9)

In [None]:
# Re-read the raw CSV file. This replaces the earlier "carfeatures",
# so the "status" column created above is not carried over.
carfeatures = pd.read_csv("data_raw/features.csv")
def car_func(weight):
    """Classify a car as 'Large', 'Midsize' or 'Small' based on its weight.

    Parameters
    ----------
    weight : int or float
        Weight of the car.

    Returns
    -------
    str
        'Large'   if weight >= 4000,
        'Midsize' if 3000 <= weight < 4000,
        'Small'   otherwise.
    """
    # The categories use closed lower bounds and no upper cap, so that
    # boundary values (exactly 3000 or 4000) and very heavy cars
    # (5000 or more) are not mistakenly labeled as 'Small'.
    if weight >= 4000:
        return 'Large'
    elif weight >= 3000:
        return 'Midsize'
    else:
        return 'Small'

# Apply the function to each value of "weight" to create
# a new column called "car_size"
carfeatures['car_size'] = carfeatures['weight'].apply(car_func)

# Use "display" (as in the rest of the notebook) so that the
# dataset is rendered as a formatted table instead of plain text
display(carfeatures)

<font size = "5">

(b) Use queries + global variables

- You will be asked to compute certain summary statistics <br>
(mean, median, etc)
- The query will have multiple conditions
- Then subset a dataset that meets certain conditions
- See Lecture 10 for more details

In [35]:
# Check what kind of object a single column is (a pandas Series)
type(carfeatures["horsepower"])

pandas.core.series.Series

In [40]:
# Thresholds stored as global variables so they can be reused in the query
min_mpg = 20
max_displace = 300
# Use .query() to compute summary statistics according to conditions
# Note: Inside the query string, "@" refers to a variable defined outside of it
subset_1 = carfeatures.query("(mpg >= @min_mpg) and (displacement <= @max_displace)")

# Calculate the mean and median of mileage for the subset
mean_mileage = subset_1['mpg'].mean()
median_mileage = subset_1['mpg'].median()

<font size = "5">

(c) Use sorting + ".iloc[]"

- Extract the observations with the largest values of a column
- See Lecture 10 for details

In [41]:
# Sort the rows from the highest to the lowest "mpg" (ascending = False)
carsorted = carfeatures.sort_values(by = "mpg", ascending = False)
# ".iloc[]" selects rows by position: positions 0, 1, 2 of the sorted
# dataset are the three observations with the largest "mpg"
carsorted.iloc[[0, 1, 2]]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,vehicle_id,car_size
322,46.6,4,86,65,2110,17.9,M-1693322,Small
329,44.6,4,91,67,1850,13.8,H-1693399,Small
325,44.3,4,90,48,2085,21.7,V-1693355,Small


<font size = "5">

(d) Split a dataset into subsets

- You will be asked to randomly assign a status to each row
- Split the data into separate datasets using ".query()"
- This will closely follow the material in Lecture 11 (this one)
- You will need this result to answer questions (e), (f)


In [None]:
# Arms of the experiment and the probability of drawing each one
# (the proportions are listed in the same order as the arms)
list_status  = ["Treatment", "Control"]
prop_status  = [0.4, 0.6]
size_dataset = carfeatures.shape[0]

# Fix the seed so that the random assignment is the same on every run
np.random.seed(42)

# Draw one status per row and store it as a new column
carfeatures["status"] = np.random.choice(
    list_status,
    size = size_dataset,
    p = prop_status,
)

# Split the data into one dataset per arm
# (outer single quotes, inner double quotes for the text value)
data_treated = carfeatures.query('status == "Treatment" ')
data_control = carfeatures.query('status == "Control" ')

<font size = "5">

(e) Create a function with four inputs $f(y,x,b0,b1)$

- Start by using "def" to define the function
- The function will include arithmetic operations (Lecture 3) <br>
and summary statistics for pandas (mean, std, min, max, etc.)
- You will be asked to test different values of $(y,x,b0,b1)$
- You will get $y$ and $x$ from the two datasets in part (d)
- Note: You will **not** be required to use the "statsmodels" library


In [44]:
def calculate_statistics(y, x, b0, b1):
    """Summarize how well the line b0 + b1 * x predicts y.

    Parameters
    ----------
    y : pandas.Series (or NumPy array)
        Observed outcome values.
    x : pandas.Series (or NumPy array)
        Values of the explanatory variable.
    b0 : float
        Intercept of the line.
    b1 : float
        Slope of the line.

    Returns
    -------
    pandas.DataFrame
        Output of ".describe()" (count, mean, std, min, quartiles, max)
        for the columns 'Y', 'X', 'Y_pred' and 'Residuals'.

    Notes
    -----
    When "y" and "x" are pandas Series they are matched by index label,
    so rows whose labels do not appear in both inputs produce missing
    values. Use inputs that share the same index (or NumPy arrays of the
    same length) to compare them element by element.
    """
    # Use the function inputs (not global datasets) so that the
    # result depends only on the arguments passed by the caller

    # Calculate the predicted values using the linear regression formula
    y_pred = b0 + b1 * x

    # Calculate the residuals (observed minus predicted)
    residuals = y - y_pred

    # Create a DataFrame to store the relevant statistics
    stats = pd.DataFrame({'Y': y, 'X': x, 'Y_pred': y_pred, 'Residuals': residuals})

    # Calculate summary statistics using pandas
    summary_stats = stats.describe()

    return summary_stats


<font size = "5">

(f) Create two overlapping histogram plots

- You will use a variable from the two datasets in (d)
- You need to use the "alpha" option to make the graphs semitransparent
- You will need to add a legend, label the axes, and the title
- Note: The goal of this question is to illustrate that random <br>
assignment produces very similar distributions between two groups

In [None]:
# Overlapping histograms of "mpg" for the two datasets created in part (d)
# The option "alpha = 0.5" makes the bars semitransparent, so that
# both groups remain visible where the histograms overlap
plt.hist(data_treated["mpg"], alpha = 0.5, label = "Treatment")
plt.hist(data_control["mpg"], alpha = 0.5, label = "Control")

# Label the axes, add a title, and show which color is which group
plt.xlabel("Miles per gallon (mpg)")
plt.ylabel("Frequency")
plt.title("Distribution of mpg in the treatment and control groups")
plt.legend()
plt.show()