## saving model using pickle

In [None]:
import pickle
# Write the fitted model to disk in binary mode.
with open('model_pickle', 'wb') as file:
    pickle.dump(model, file)

# Read it back into a new object.
# NOTE: only unpickle files you trust; loading a pickle can run arbitrary code.
with open('model_pickle', 'rb') as file:
    mp = pickle.load(file)

# Use the reloaded model for a single-sample prediction.
mp.predict([[5000]])

## saving model using joblib

In [None]:
import joblib
# Persist the fitted model with joblib, then load it back from the same path.
model_path = 'model_joblib'
joblib.dump(model, model_path)
mj = joblib.load(model_path)

# Use the reloaded model for a single-sample prediction.
mj.predict([[5000]])

## Using pandas to create dummy variables

In [None]:
import pandas as pd
# Build one indicator (dummy) column per distinct value of `town`.
dummies = pd.get_dummies(df['town'])

# Place the indicator columns next to the original columns.
merged = pd.concat([df, dummies], axis=1)

# The raw `town` column is now represented by the dummies, so drop it.
final = merged.drop(columns=['town'])

## label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace every town name with an integer code. fit_transform learns the
# name -> code mapping and applies it in one call.
le = LabelEncoder()
town_codes = le.fit_transform(df['town'])
df['town'] = town_codes
df

## using one hot encoding to create dummy variables

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# One-hot encode the column at position 0 and pass all other columns through
# unchanged.
town_step = ('town', OneHotEncoder(), [0])
ct = ColumnTransformer(transformers=[town_step], remainder='passthrough')
X = ct.fit_transform(X)
# X is expected to be 2-D here: one row per sample, one column per feature.

## train test splitting

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Compare the true test labels with the model's predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_predicted)


## stratified shuffle split

In [None]:
# Problem: CHAS has only two values, 0 and 1, and 1 is rare
# (roughly 475 rows with 0 versus 35 rows with 1).
# With a plain random split (e.g. 402 training rows, 104 test rows) the
# training data may contain no rows with CHAS == 1, so the model would learn
# the wrong pattern that the only possible value of CHAS is 0.
# Solution: stratify the split on CHAS so both sets keep its proportions.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['CHAS']):
    # split() yields positional row numbers, not index labels, so select with
    # .iloc. Using .loc is only correct when the index happens to be the
    # default 0..n-1 range.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

## looking for Correlations

In [None]:
# Correlation of every column with every other column.
corr_matrix = housing.corr()

# Correlations against the label column, largest first.
# ('label_name' is a placeholder for the actual label column.)
label_corr = corr_matrix['label_name']
label_corr.sort_values(ascending=False)
# How to read the result:
#   - a value close to 1 means a strong positive correlation
#   - a value close to -1 means a strong negative correlation
# Example from the housing data: RM is 0.69, a high positive correlation, so
# when RM increases MEDV tends to increase too; ZN and B are only weakly
# positively correlated. LSTAT is strongly negatively correlated: the lower
# LSTAT is, the higher MEDV tends to be.

## Missing attributes

In [None]:
# To take care of missing attributes, you have three options.
# Every call below returns a new object and leaves `housing` itself unchanged;
# assign the result back if you want to keep it.
# (The statements sit at column 0: indenting them under the comments would
# raise an IndentationError.)

# 1. Get rid of the missing data points (rows where RM is missing)
a = housing.dropna(subset=["RM"])

# 2. Get rid of the whole attribute (column)
housing.drop("RM", axis=1).shape

# 3. Set the missing values to some value (0, mean or median)
housing["RM"].fillna(housing["RM"].median())
housing["RM"].fillna(housing["RM"].mean())
housing["RM"].fillna(housing["RM"].mode()[0])  # for object type
    

## Creating a pipeline

In [None]:
# Instead of running the imputer by hand, put the preprocessing steps in a
# Pipeline, which applies them in order automatically.
from sklearn.impute import SimpleImputer  # needed for the 'imputer' step below
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

my_pipeline = Pipeline([
    # Fill missing values using the median strategy.
    ('imputer', SimpleImputer(strategy='median')),
    # ... add as many steps as you want ...
    ('std_scaler', StandardScaler()),
])

## Using better evaluation techniques - Cross Validation

In [None]:
# K-fold cross-validation: the data is split into `cv` folds (10 here).
# E.g. with folds 1 2 3 ... 10: first train on every fold except 1 and check
# on fold 1, then repeat for fold 2, 3, ... up to the last fold.
import numpy as np
from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    model,
    housing_num_tr,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scores are negated mean squared errors, so flip the sign before taking
# the square root to get one RMSE per fold.
rmse_scores = np.sqrt(-scores)

## Min max scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale each column independently with a min-max scaler. The same scaler
# object is refit for every column, so after the loop it holds the fit for
# 'Age'.
scaler = MinMaxScaler()

for column in ('Income($)', 'Age'):
    df[column] = scaler.fit_transform(df[[column]])

## Count vectoriser

In [None]:
# to convert sentences in data frame to some numbers
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training texts and convert each text into
# token counts in one step.
v = CountVectorizer()
train_texts = X_train.values
X_train_count = v.fit_transform(train_texts)

## hyper parameter tuning with grid search cv

In [None]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

# Candidate models for the grid search. Each entry maps a display name to:
#   'model'  - an unfitted estimator instance
#   'params' - the parameter grid to search for that estimator
#              (an empty dict means "no tuning, use the estimator as given")
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    },
    # The two naive Bayes variants are evaluated without any tuning.
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {}
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini','entropy'],
            
        }
    }     
}


from sklearn.model_selection import GridSearchCV
import pandas as pd
# Run a 5-fold grid search for every candidate in `model_params` and keep
# each candidate's best score and the parameters that produced it.
scores = []

for model_name, mp in model_params.items():
    estimator, param_grid = mp['model'], mp['params']
    clf = GridSearchCV(estimator, param_grid, cv=5, return_train_score=False)
    clf.fit(digits.data, digits.target)

    result = {
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    }
    scores.append(result)

# One row per model, for easy side-by-side comparison.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df