# Prediction Process and Models

In [None]:
# Imports

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from statistics import mean
import statsmodels.formula.api as smf

# Task 1: Data Preparation

In [None]:
# Load data.txt (pipe-delimited) into a pandas dataframe.
# pre_df holds the raw, still unorganized contents of the file.

raw_columns = ["ID", "Variable", "Value"]
pre_df = pd.read_csv("data.txt", sep="|", names=raw_columns, header=None)

# The first parsed row is the file's own header line, so remove it.
header_label = pre_df.index[0]
pre_df = pre_df.drop(index=header_label)

# After that removal, the row at position 894 holds erroneous text; remove it too.
bad_label = pre_df.index[894]
pre_df = pre_df.drop(index=bad_label)

pre_df.head()

In [None]:
# Data fields are i, a, h, w, and score, so the tidy frame needs 5 columns.
# Row count is len(pre_df) / 5 (2170 / 5 = 434 for this data set).

cols = 5
rows, _ = divmod(len(pre_df), cols)
f'Rows needed: {rows}'

In [None]:
# Create the rows x 5 dataframe (434 x 5 here) from the contents of pre_df.
#
# pre_df['Value'] is read as five consecutive runs of `rows` entries each;
# run number k supplies the values of one output column. The mapping below
# records which run feeds which column.
#
# The frame is built in a single constructor call rather than with
# DataFrame.append inside a loop: append was removed in pandas 2.0 and copied
# the whole frame on every iteration.

run_for_column = {
    "child_iq": 0,
    "mother_age": 4,
    "mother_hs": 1,
    "mother_iq": 2,
    "mother_work": 3,
}

raw_values = pre_df['Value']

df = pd.DataFrame({
    column: raw_values.iloc[rows * run:rows * (run + 1)].to_numpy()
    for column, run in run_for_column.items()
})

df.head()

# Task 2: Data Transformation

In [None]:
# Report how many null values each column contains

missing_per_column = df.isnull().sum()

for column, n_missing in missing_per_column.items():
    print(f'Column \"{column}\" has {n_missing} null values.')

# There are no null values in this dataframe, so there is no need to handle null values

In [None]:
# Convert the numeric fields from string to float

# mother_iq values carry stray apostrophes that must go before conversion
df['mother_iq'] = df['mother_iq'].str.replace("'", "")

for numeric_col in ('mother_iq', 'child_iq', 'mother_age'):
    df[numeric_col] = df[numeric_col].astype(float)

df.head()

In [None]:
# Rescale the numerical columns (child IQ, mother age, mother IQ) to the 0-1 range.
# The rescaled `df` is the frame used for the prediction models.

# Keep an unscaled snapshot for the exploratory data analysis
df_original = df.copy()

scaled_cols = ['child_iq', 'mother_age', 'mother_iq']
mm_scaler = MinMaxScaler()
df[scaled_cols] = mm_scaler.fit_transform(df[scaled_cols])

# Mother education and mother workload are stored as strings
for categorical_col in ('mother_hs', 'mother_work'):
    df[categorical_col] = df[categorical_col].astype(str)

df.head()

# Task 1 and 2 Comments:

- Created dataframe `pre_df` and then transformed it into `df`
    - Loaded raw data and then organized it into intended columns
    - Given 2170 valid data entries and 5 columns, created 434 rows
    - Rows were created by treating each consecutive run of 434 data entries as one column
        - For example: the first row used indices 0, 434, 868, 1302, and 1736; the second used indices 1, 435, 869, 1303, and 1737; and so on.
- Original variables were renamed for readability
- Child IQ (`child_iq`), mother IQ (`mother_iq`), and mother age (`mother_age`) were normalized to be floats between 0-1 (to be used in task 4)
- Mother education (`mother_hs`) and mother workload (`mother_work`) were converted to strings to be used as categorical variables
- Checked for null values just in case

# Task 3: Exploratory Data Analysis

In [None]:
# Draw boxplots for child IQ, mother age, and mother IQ from the unscaled data

box_columns = ['child_iq', 'mother_age', 'mother_iq']
boxplot = df_original.boxplot(column=box_columns)
boxplot.set_title('Child IQ, Mother IQ, and Mother Age Boxplots')

In [None]:
def print_stats(col_name):
    """Print the median, mean, and range (max - min) of a `df_original` column.

    col_name: label of the column in the notebook-level `df_original` frame.
    """
    values = df_original[col_name]
    spread = values.max() - values.min()

    print(col_name, "median:", values.median())
    print(col_name, "mean:", values.mean())
    # The trailing "\n" argument leaves a blank line after each column's stats
    print(col_name, "range:", spread, "\n")

# Summarize each numerical column in turn
for stats_col in ('child_iq', 'mother_iq', 'mother_age'):
    print_stats(stats_col)

- Children in this dataset have a wide range of IQs; low outliers pull the mean below the median
- Mothers in this dataset have a smaller range of IQs, which are on average higher than their children's
- Mothers have an average age of 23, range is 17 to 29

In [None]:
# Histogram of child IQ using 40 bins (no density curve)

child_iq_values = df_original['child_iq']
ax = sns.displot(data=child_iq_values, bins=40, kde=False)

axis_text = {
    'xlabel': 'Child IQ',
    'ylabel': 'Number of Children',
    'title': 'Child IQ Distribution',
}
ax.set(**axis_text)

In [None]:
# Bar chart: mean child IQ split by whether the mother graduated high school

x = [0.5, 1.5]  # bar centres
y = df_original.groupby('mother_hs')['child_iq'].mean()

fig, ax = plt.subplots()
bars = ax.bar(x, y, width=0.5)

# One colour per group
for bar, colour in zip(bars, ('r', 'b')):
    bar.set_color(colour)

ax.set(xlim=(0, 2), ylim=(0, 100))
ax.set_xticks(x)
ax.set_xticklabels(['No high school education', 'High school education'])
ax.set_ylabel('Child IQ')
ax.set_title('Average Child IQ grouped by Mother Education')

plt.show()

Average child IQ is higher with mothers who graduated high school versus those who did not graduate high school.

In [None]:
# Bar chart: mean child IQ for every distinct mother age

mean_iq_by_age = df_original.groupby('mother_age')['child_iq'].mean()
mean_iq_by_age.plot.bar(
    xlabel='Mother Age',
    ylabel='Child IQ',
    title='Average Child IQ by Mother Age',
)

This graph shows no significant trend between a mother's age and her child's IQ.

In [None]:
# Create a new column that sorts mother age into 3 groups, then compare it to child IQ

age_min = int(df_original['mother_age'].min())
age_max = int(df_original['mother_age'].max())
bin_range = age_max - age_min
group_width = bin_range // 3

# pd.cut is called with right=False, so every bin is [low, high) and excludes
# its upper edge. The final edge must therefore sit above the maximum age;
# using the maximum itself as the last edge would leave the oldest mothers
# outside every bin (NaN group), dropping them from the chart and from any
# model that uses mother_age_group.
bins = [age_min, age_min + group_width, age_min + 2 * group_width, age_max + 1]

# Group codes on a 0-1 scale: youngest, middle, oldest
labels = [0, 0.5, 1]

df_original['mother_age_group'] = pd.cut(df_original['mother_age'], bins=bins, labels=labels, right=False)
df['mother_age_group'] = df_original['mother_age_group']

x = [0.5, 1.5, 2.5]  # bar centres
# observed=False keeps one bar per group and states the categorical groupby behaviour explicitly
y = df_original.groupby('mother_age_group', observed=False)['child_iq'].mean()

fig, ax = plt.subplots()

bars = ax.bar(x, y, width=0.5)
ax.set(xlim=(0, 3), ylim=(0, 100))
plt.xticks(x, ['Age 17 to 20', 'Age 21 to 24', 'Age 25 to 29'])
ax.set_xlabel('Mother\'s Age Group')
ax.set_ylabel('Child IQ')
ax.set_title('Average Child IQ by Mother Age Group')

bars[0].set_color('y')
bars[1].set_color('orange')
bars[2].set_color('r')

plt.show()

By categorizing mother age, it seems that child IQ increases slightly based on age group of the mother.

In [None]:
# Bar chart: mean child IQ for each mother workload type

x = [0.5, 1.5, 2.5, 3.5]  # bar centres
y = df_original.groupby('mother_work')['child_iq'].mean()

fig, ax = plt.subplots()
bars = ax.bar(x, y, width=0.5)

# One colour per workload type
for bar, colour in zip(bars, ('r', 'b', 'g', 'y')):
    bar.set_color(colour)

ax.set(xlim=(0, 4), ylim=(0, 100))
ax.set_xticks(x)
ax.set_xticklabels(['Type 1', 'Type 2', 'Type 3', 'Type 4'])
ax.set_xlabel('Mother\'s Workload (See types below)')
ax.set_ylabel('Child IQ')
ax.set_title('Average Child IQ based on Mother\'s Workload')

plt.show()

#### Types:

- Type 1: "Mother did not work in first three years of child's life"
- Type 2: "Mother worked in second or third year of child's life"
- Type 3: "Mother worked part-time in first year of child's life"
- Type 4: "Mother worked full-time in first year of child's life"

It seems that the workload of a mother does not have a noticeable effect on the IQ of her child.

In [None]:
# Combine education and workload into a single categorical column

workload_code = df_original['mother_work'].astype(float)
high_school_code = df_original['mother_hs'].astype(float)

# Workload minus education, stored as a string
df['mother_edu_work'] = (workload_code - high_school_code).astype(str)

# Examples:
# Type 4 (4) - no high school (0) = 4 (highest)
# Type 1 (1) - high school (1) = 0 (lowest)

In [None]:
# Plot mean child IQ against mother IQ and overlay a straight-line fit

mean_iq_by_mother_iq = df_original.groupby('mother_iq')['child_iq'].mean()
plt_iq = mean_iq_by_mother_iq.plot(
    xlabel='Mother IQ',
    ylabel='Child IQ',
    title='Average Child IQ based on Mother IQ',
)

# Read the plotted points back from the first line on the axes
plotted_line = plt_iq.lines[0]
x, y = plotted_line.get_data()

# Degree-1 polynomial fit gives the trend line
fit = np.polyfit(x, y, 1)
line = np.poly1d(fit)

plt_iq.plot(x, line(x))

Generally, as mother IQ increases, so does child IQ.

In [None]:
# Check data correlation between numerical columns

# `df` also holds string and categorical columns at this point. Restrict the
# frame to numeric dtypes before calling corr(): pandas >= 2.0 raises on
# non-numeric columns instead of silently skipping them.
corr_vals = df.select_dtypes(include='number').corr()
sns.heatmap(corr_vals, cmap="mako", annot=True)

No pair of columns is strongly correlated (value > 0.7). Mother IQ is more strongly correlated with child IQ than mother age is.

# Task 3 Comments:

- Child IQ's are skewed towards the 80-120 range
- It seems that mother IQ and mother high school education correlate with child IQ
- There is no significant relationship between child IQ and either mother workload or mother age
- Added mother age group and mother education combined with workload as columns for analysis

# Task 4: Prediction Models

## 4.1: Using a single predictor

In [None]:
# Create a function to check different columns to use for linear regression with a single predictor
# Use Ordinary Least Squares Regression

def single_predictor(col_name):
    """Fit and report an OLS regression of `child_iq` on one column of `df`.

    col_name: name of the predictor column, used as the right-hand side of
        the formula.

    Prints the fitted model's summary and returns the fitted results object.
    """
    formula = f"child_iq ~ {col_name}"
    fitted = smf.ols(formula=formula, data=df).fit()
    print(fitted.summary())
    return fitted

# Create plotting function

def plot_single_predictor(model, col_name):
    """Scatter `child_iq` against one predictor and draw the fitted line.

    model: fitted single-predictor model whose `params` unpack to
        (intercept, slope).
    col_name: name of the predictor column in `df`.
    """
    intercept, slope = model.params

    # Column vectors of the predictor and the observed response
    predictor = df[col_name].values.reshape(-1, 1)
    observed = df['child_iq'].values.reshape(-1, 1)

    # Fitted line: b + m * x
    fitted_line = intercept + slope * predictor

    plt.scatter(predictor, observed)
    plt.plot(predictor, fitted_line, color='red')
    plt.xlabel(col_name)
    plt.ylabel('child_iq')
    plt.show()

In [None]:
# Fit the single-predictor model with mother IQ as the predictor

model = single_predictor(col_name='mother_iq')

In [None]:
# Scatter plot with the fitted line for the mother IQ model
plot_single_predictor(model=model, col_name='mother_iq')

In [None]:
# Fit the single-predictor model with mother age as the predictor

model = single_predictor(col_name='mother_age')

In [None]:
# Scatter plot with the fitted line for the mother age model
# Data is not suited for linear regression

plot_single_predictor(model=model, col_name='mother_age')

## 4.1 Comments:

- Based on the regression results, the `mother_iq` column provides the best linear regression model using a single predictor
- Assumptions:
    - Mother IQ and child IQ have some correlation (0.45)
- Coefficients:
    - R^2 = 0.201, higher than R^2 for the other model
    - `mother_iq` had a coefficient of 0.3338, indicating a positive relationship between `child_iq` and `mother_iq`
    - P>|t| = 0.000 (rounded), so the hypothesis that `mother_iq` has no effect on `child_iq` is rejected
    - Skew = -0.324, our data is slightly skewed

## 4.2: Using multiple predictors

In [None]:
# Regress child IQ on mother IQ, mother education, mother age group, and mother workload

full_formula = "child_iq ~ mother_iq + mother_hs + mother_age_group + mother_work"
full_spec = smf.ols(formula=full_formula, data=df)
model = full_spec.fit()
print(model.summary())

In [None]:
# Reduced model: only mother IQ and mother high school education as predictors

reduced_formula = "child_iq ~ mother_iq + mother_hs"
reduced_spec = smf.ols(formula=reduced_formula, data=df)
model = reduced_spec.fit()
print(model.summary())

## 4.2 Comments:

- Using multiple predictors seems to improve the model slightly
- The first model, using mother IQ, mother education, mother age group, and mother workload, produces a better model
- Coefficients:
    - R^2 = 0.225, higher than R^2 for models in previous section
    - Largest positive correlation between `child_iq` and `mother_iq` (coefficient of 0.3156), all other variables had a lower, positive correlation
    - P>|t| = 0.000 (rounded), so the hypothesis that `mother_iq` has no effect on `child_iq` is rejected
    - Higher P>|t| values for `mother_age_group` and `mother_work`, higher probability they have no effect on `child_iq`, consistent with my findings in section 3
    - Skew = -0.278, our data is slightly skewed, but less than my best model from section 4.1

## 4.3 Using multiple predictors with an interaction term

In [None]:
# Interaction model: mother IQ, education/workload, age group, plus IQ x education/workload
interaction_formula = "child_iq ~ mother_iq + mother_edu_work + mother_age_group + mother_iq:mother_edu_work"
interaction_spec = smf.ols(formula=interaction_formula, data=df)
model = interaction_spec.fit()
print(model.summary())

In [None]:
# Interaction model without the mother age group term
interaction_formula = "child_iq ~ mother_iq + mother_edu_work + mother_iq:mother_edu_work"
interaction_spec = smf.ols(formula=interaction_formula, data=df)
model = interaction_spec.fit()
print(model.summary())

## 4.3 Comments:

- Using multiple predictors and an interaction term seems to improve the model slightly
- The first model using mother IQ, mother education with workload, and mother age group with an interaction term between mother IQ and mother education and mother workload produces a better model (R^2 = 0.242, adjusted R^2 = 0.222)
- Coefficients:
    - R^2 = 0.242, higher than R^2 for models in previous sections
    - Largest positive correlation between `child_iq` and `mother_iq` interacting with `mother_edu_work`(value 1) (coefficient of 0.2492)
        - Value 1: Mother did not work in first 3 years of child's life (w = 1) and mother did not graduate from high school (h = 0) OR Mother worked in second or third year of child's life (w = 2) and mother graduated from high school (h = 1)
    - Some other variables had a lower, positive correlation with `child_iq`: `mother_iq`, `mother_iq` interacting with `mother_edu_work` except for Type 3 work, and `mother_age_group`
    - `mother_edu_work` had a negative correlation with `child_iq` (most significant coefficient was -0.1360)
    - P>|t| = 0.012 for `mother_iq`, so the hypothesis that it has no effect on `child_iq` is rejected at the 5% level
    - Very high (greater than 0.8) P>|t| values for `mother_iq` interacting with `mother_edu_work` (value 3) and `mother_edu_work` (value 3), high probability they have no effect on `child_iq`
        - Value 3: Mother worked full-time in first year of child's life (w = 4) and mother graduated from high school (h = 1) OR Mother worked part-time in first year of child's life (w = 3) and mother did not graduate from high school (h = 0)
    - Skew = -0.338, our data is slightly skewed

# Closing Words

Using multiple predictors with an interaction term produced the best linear regression model. However, the models produced here leave much to be desired in terms of predicting child IQ from the given data. 