# Linear Regressions

## Basic Linear Regressions
We now turn to basic linear regressions:
- **dependent variable**: `ACTOP`
- **independent variables**: `AGE5`, `DIP11`, `SEXE`

In [None]:
# Connect to database and load the data we need into a DataFrame
with sqlite3.connect(eedb) as con:
    query = "SELECT actop, age5, dip11, sexe FROM eec15"
    df = pd.read_sql_query(query, con)

# Refactor the data in the format we need for regressions
# In particular, convert categorical variables into dummy variables

## Drop rows where ACTOP, AGE5 or DIP11 is an empty string.
## .copy() makes the result an independent frame, so the column assignments
## below do not operate on a slice of the query result
## (avoids pandas' SettingWithCopyWarning).
has_values = (df["actop"] != "") & (df["age5"] != "") & (df["dip11"] != "")
df = df[has_values].copy()

## ACTOP: map ('1', '2') -> (0, 1); any other value becomes NaN
df["actop"] = df["actop"].map({'1': 0, '2': 1})

## AGE5: one 0/1 dummy column "age<code>" for every code present in the data
df["age5"] = df["age5"].astype(int)
age_codes = df["age5"].copy()
# sorted() makes the order of the new columns deterministic
for code in sorted(age_codes.unique()):
    df["age{}".format(code)] = np.where(age_codes == code, 1, 0)

## DIP11: one 0/1 dummy column "dip<code>" for every code present in the data
## (codes are not grouped here).
df["dip11"] = df["dip11"].astype(int)
# The dummy for code 11 is itself named "dip11" and therefore replaces the raw
# column. Compare against a snapshot of the raw codes so that dummies created
# after that point are still computed from the original values rather than
# from the 0/1 dummy.
dip_codes = df["dip11"].copy()
for code in sorted(dip_codes.unique()):
    df["dip{}".format(code)] = np.where(dip_codes == code, 1, 0)

## SEXE: map ('1', '2') -> (0, 1) into a "female" indicator
df["female"] = df["sexe"].map({'1': 0, '2': 1})

df.head()

In [None]:
import statsmodels.api as sm

# Keep only the rows whose age60 dummy is 0
df = df.loc[df.age60 == 0]

# Reference: https://www.datarobot.com/blog/multiple-regression-using-statsmodels/
# Regressor names. The last age bracket ("50") is left out so that it serves
# as the reference category (dummy variable trap); "60" is not listed at all.
age_brackets = ("15", "30", "40", "50")
female = ["female"]
age = ["age" + bracket for bracket in age_brackets[:-1]]
# Diploma dummies are not part of the model yet. Intended list, kept for reference:
# diploma = ["dip{}".format(x) for x in ["10", "11", "30", "31", "33", "41", "42", "50", "60", "70", "71"][:-1]] # careful with dip11 (same name as original column)

# Design matrix (with an intercept column) and response
X = sm.add_constant(df[female + age])  # + diploma
y = df["actop"]

# Fit ordinary least squares and show the results table
est = sm.OLS(y, X).fit()
est.summary()

# TODO:
    # talk about the "dummy variable" trap
    # fix the DIP11 bug
    # give an example of the model and what the dummies represent (for reference)
        # see "specification du modele" in your notes