<a href="https://colab.research.google.com/github/mariumabid/AI-challenge-/blob/master/2_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
from sklearn import preprocessing

### Data pre-processing

Only alter the data pre-processing code if you have completed the challenge for that section.

In [0]:
# The data URI
csv_file_uri = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "target"
]


data_original = pd.read_csv(csv_file_uri, names=column_names, index_col=False)

USE_LABEL_ENCODER = False


if USE_LABEL_ENCODER:

    # Make a copy so that we always have the original data to refer to
    data = data_original.copy(deep=True)

    # Drop the US weights (don't have any value)
    data.drop(["fnlwgt"], axis=1, inplace=True)

    # Create a function that changes the text to a simple binary value
    def convert_target_variable(text):
        if text == " <=50K":
            return 0
        else:
            return 1

    data["target_encoded"] = data.target.apply(convert_target_variable)

    # Deletes the original column in this dataframe.
    data.drop(["target"], axis=1, inplace=True)

    encoded_columns = []
    for c in data.columns:
        if data[c].dtype == "object":
            if "{}_encoded".format(c) not in data.columns:
                encoder = preprocessing.LabelEncoder()
                data["{}_encoded".format(c)] = encoder.fit_transform(data[c].values)
                encoded_columns.append(c)
                encoder = None
            else:
                print("{}_encoded already exists".format(c))

    print("Dropping the encoded columns {}".format(encoded_columns))
    data.drop(encoded_columns, axis=1, inplace=True)
    
else:
    
    # Make a copy so that we always have the original data to refer to
    data_pre_dummies = data_original.copy(deep=True)

    # Drop the US weights (don't have any value)
    data_pre_dummies.drop(["fnlwgt"], axis=1, inplace=True)
    
    data = pd.get_dummies(data_pre_dummies)

    # Deletes the original column in this dataframe.
    data.drop(["target_ <=50K"], axis=1, inplace=True)

    # Rename the target
    data.rename(columns={'target_ >50K': 'target' }, inplace=True)

---
### Model

This is the moment we can use the new, numerical, data to plug it into pretty much any classification model. First we'll convert the data to a matrix with our features - that is the data that we want to use to predict from - and an array with our labels - the target variable that indicates if someone makes more than 50k or not.

In [0]:
feature_columns = data.columns.tolist()
feature_columns.remove("target")

In [0]:
X = data[feature_columns].values
y = data["target"].values

print(X.shape)
print(y.shape)

(32561, 107)
(32561,)


In [0]:
# Quick tips on how to index matrices/arrays

# The first ROW
print(X[1,:])

# The first TWO ROWS
print(X[:2,:])

# The last ROW
print(X[-1,:])

# The first 3 ROWS with only the last TWO COLUMNS
print(X[:3,-2:])

[50 13  0  0 13  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  1
  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0
  0  0  0  0  1  0  0  0  0  0  0  0  0  0  1  0  1  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  1  0  0]
[[  39   13 2174    0   40    0    0    0    0    0    0    0    1    0
     0    0    0    0    0    0    0    0    0    1    0    0    0    0
     0    0    0    0    0    0    1    0    0    0    1    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    1    0    0
     0    0    0    0    0    0    1    0    1    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    1    0    0]
 [  50   13    0    0   13    0    0    0    0    0    0    1    0    0
     0    0    0    0    0    0    0    0    0   

Lets use a model from **scikit-learn**: LogisticRegression

For those interested in the [documentation](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression)

In [0]:
from sklearn import linear_model

# Create linear regression object
clf = linear_model.LogisticRegression()

In [0]:
# MAGIC
# 
# Train the model using the training sets
clf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [0]:
# Make predictions using the testing set. 
# For now we'll use the last value of the training set.
pred = clf.predict(X[-1,:].reshape(1,-1))

In [0]:
# Print the data 
print(data_original.iloc[data_original.index[-1]])

# and the prediction
print("Predict its a {}".format(pred))

probability = clf.predict_proba(X[-1,:].reshape(1,-1))
print("With a probability of {}".format(probability[0,pred[0]]))

age                                52
workclass                Self-emp-inc
fnlwgt                         287927
education                     HS-grad
education-num                       9
marital-status     Married-civ-spouse
occupation            Exec-managerial
relationship                     Wife
race                            White
sex                            Female
capital-gain                    15024
capital-loss                        0
hours-per-week                     40
native-country          United-States
target                           >50K
Name: 32560, dtype: object
Predict its a [1]
With a probability of 0.9941035934636746


In [0]:
# How did the model do this? We can see the coefficients 
# for each column using 
# print(clf.coef_)

# Pretty print with the column names
for ix, c in enumerate(feature_columns):
    print("Column {} is {}".format(c, clf.coef_[0][ix]))

Column age is 0.023416553291590952
Column education-num is 0.15128177678973417
Column capital-gain is 0.0003111700725644025
Column capital-loss is 0.0006361900682646348
Column hours-per-week is 0.02807788322366632
Column workclass_ ? is -0.5690943712804999
Column workclass_ Federal-gov is 0.21952833795020954
Column workclass_ Local-gov is -0.46820920929449106
Column workclass_ Never-worked is -0.006914877746796402
Column workclass_ Private is -0.3061065500279686
Column workclass_ Self-emp-inc is -0.15350829847916853
Column workclass_ Self-emp-not-inc is -0.7902409255086751
Column workclass_ State-gov is -0.6081554667087883
Column workclass_ Without-pay is -0.052733989039708516
Column education_ 10th is -0.6651138743054429
Column education_ 11th is -0.7633972069283986
Column education_ 12th is -0.3483340309046212
Column education_ 1st-4th is -0.21355551865315978
Column education_ 5th-6th is -0.3746150259806669
Column education_ 7th-8th is -0.7674462429905754
Column education_ 9th is -0.

In [0]:
# How good is the model by evaluating it 
# on the training set
print("This model has an overall accuracy of {}".format(clf.score(X, y)))

This model has an overall accuracy of 0.8512023586499187


**The above is incredibly bad practice (up to the point where I should be fired for even showing you). Why?**