## Getting sense out of Data

In [None]:
## Checking dataframe columns, NaNs, memory usage
# info() prints its report itself and returns None, so it is not wrapped in print()
df.info()

## Summary statistics for the numerical columns in the dataset
print(df.describe())

## Checking for NAs
# Number of missing values per column
df.isnull().sum()
# Entries of a specific column that are missing
# (isna() also works on non-numeric columns, where np.isnan raises TypeError)
df.loc[df['COLUMN_WITH_NA'].isna(), 'COLUMN_WITH_NA']

## Check value counts for categorical columns
# Single column (value_counts is a Series method, so index the dataframe, not the pd module)
df['COLUMN_NAME'].value_counts()
# Multiple columns
pd.crosstab(df['CAT_COL_1'], df['CAT_COL_2'])

## assert statements are a cheap way to validate assumptions. For example:
# assert data.columns[1] == 'Name'
# assert data.Speed.dtypes == np.int64   # the np.int alias no longer exists in current NumPy
assert df['COLUMN'].notnull().all()

## Data Exploring

In [None]:
## Selecting rows and columns in pandas
df['COLUMN']['ROW']
# similar to:
df.COLUMN['ROW']
# preferred: one .loc lookup with scalar labels returns the same single value
# (passing ['COLUMN'] as a list would return a one-element Series instead)
df.loc['ROW', 'COLUMN']
# dataframe slicing -- .loc slices include BOTH end labels
# (row counts below assume the default 0..n-1 integer index)
df.loc[0:9, "COL_1":"COL_n"]     # first 10 rows including col_1 to col_n
df.loc[9:0:-1, "COL_1":"COL_n"]  # the same 10 rows in reverse order

## Selecting dataframe by having multiple conditions
# each condition needs its own parentheses because & binds tighter than the comparisons
df[(df.COL1 > a) & (df.COL1 < b)]

## Grouping dataframe rows based on some columns
df[['COL_1', 'COL_2']].groupby(['COL_1'], as_index=False).mean()
df.groupby('COL_1').COL_2.agg(['mean', 'min', 'max'])

## Getting all columns except some
# First approach
df.loc[:, df.columns != 'EXCLUDE_COL']
# Second approach (sort=False keeps the original column order; the default sorts the names)
df[df.columns.difference(['list_of_excluded_cols'], sort=False)]

## Data manipulating

In [None]:
## MISSING VALUES, NAs
# Drop the rows whose "COLUMN" value is missing.
# df["COLUMN"].dropna(inplace=True) would only modify the temporary Series returned by
# df["COLUMN"], leaving df itself unchanged, so the rows are dropped on the dataframe.
df = df.dropna(subset=["COLUMN"])

## Add new column to dataset on desired conditions
# look in df and for rows with 'column_name' = 'some_value', set a 'new_column'  with 'desired_value'
# (rows that do not match the condition get NaN if 'NEW_COLUMN' did not exist before)
df.loc[df['COLUMN_NAME'] == 'SOME_VALUE', 'NEW_COLUMN'] = 'DESIRED_VALUE'

## Convert numeric column into categories of range values
# NOTE: 'FOLDS_NUM' is a placeholder -- replace it with an integer number of bins
# for having (roughly) the same number of records in each bin we use qcut
df['NEW_CAT_COL'] = pd.qcut(df['NUM_COL'], 'FOLDS_NUM')
# for having evenly spaced bins we use cut
df['NEW_CAT_COL'] = pd.cut(df['NUM_COL'], 'FOLDS_NUM')

## Filling NAs in dataframe
# 'most_occurred_value' is a placeholder; df['COLUMN_NAME'].mode()[0] computes it from the data
df['COLUMN_NAME'] = df['COLUMN_NAME'].fillna('most_occurred_value')
df['COLUMN_NAME'] = df['COLUMN_NAME'].fillna(df['COLUMN_NAME'].median())

## Apply a method over a column
# Titanic dataset: looking for title from Name column
import re
def title_finder(value):
    """Return the first run of letters that is directly followed by a period.

    For a Titanic-style name such as "Braund, Mr. Owen Harris" this yields "Mr".

    Parameters
    ----------
    value : str
        Text to search. Non-string input (e.g. a NaN from a missing cell) is
        treated as having no title.

    Returns
    -------
    str
        The matched letters without the period, or "" when nothing matches.
    """
    # Missing cells arrive as float NaN when applied over a column; re.search would raise on them
    if not isinstance(value, str):
        return ""
    # Raw string so that \. reaches the regex engine as an escaped literal dot
    title = re.search(r'([A-Za-z]+)\.', value)
    if title:
        return title.group(1)
    return ""
train['Title'] = train['Name'].apply(title_finder)

## Replacing values in column
df['COLUMN'] = df['COLUMN'].replace("OLD_VALUE", "NEW_VALUE") #old_value can come in list as well.

## Searching in column values and replace
# np.where takes its two alternatives positionally (it has no `other=` keyword).
# na=False makes missing values count as "no match"; otherwise NaN would be truthy and map to 1.
df["COLUMN_NEW"] = np.where(df["COLUMN"].str.contains("SOME_STRING", na=False), 1, 0)

## Dropping one field or a group of fields from dataframe
drop_elements = ['COLUMN_1', 'COLUMN_2', 'COLUMN_3']
df = df.drop(drop_elements, axis = 1)

## Melt
# id_vars = what we do not wish to melt
# value_vars = what we want to melt
pd.melt(frame=df,id_vars = 'COLUMN', value_vars= ['list_of_melting_columns'])

## Pivoting (reverse melt)
# index   = column whose values become the row labels
# columns = column whose values become the new column names
# values  = column whose values fill the cells
df.pivot(index = 'COLUMN', columns = 'variable_col',values='value_col')

## CONCATENATING DATA
conc_data_row = pd.concat([data1,data2],axis =0,ignore_index =True) # axis = 0 : stacks the dataframes row-wise

# CONVERTING DATA TYPES
df['COLUMN'] = df['COLUMN'].astype('category') # object(string),bool, int, float and category

## Pre-processing Data
#### Normalization

In [None]:
import numpy as np
from sklearn import preprocessing

# Small 4x3 sample matrix used by every transform below
input_data = np.array([
    [5.1, -2.9, 3.3],
    [-1.2, 7.8, -6.1],
    [3.9, 0.4, 2.1],
    [7.3, -9.9, -4.5],
])

# Binarize: values are thresholded at 2.1
binarizer = preprocessing.Binarizer(threshold=2.1)
data_binarized = binarizer.transform(input_data)
print("\nBinarized data:\n", data_binarized)

# Standardize the columns, then report per-column mean / std before and after
data_scaled = preprocessing.scale(input_data)
for banner, matrix in (("\nBEFORE:", input_data), ("\nAFTER:", data_scaled)):
    print(banner)
    print("Mean =", matrix.mean(axis=0))
    print("Std deviation =", matrix.std(axis=0))

# Min max scaling into the (0, 1) range
data_scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled_minmax = data_scaler_minmax.fit_transform(input_data)
print("\nMin max scaled data:\n", data_scaled_minmax)

# Normalize with the L1 and L2 norms
data_normalized_l1 = preprocessing.normalize(input_data, norm='l1')
data_normalized_l2 = preprocessing.normalize(input_data, norm='l2')
print("\nL1 normalized data:\n", data_normalized_l1)
print("\nL2 normalized data:\n", data_normalized_l2)

#### Dummy features

In [None]:
# Read the raw table ('file_path' is a placeholder)
data = pd.read_csv('file_path')
# get_dummies turns the categorical columns into dummy (indicator) features.
# For a binary category one of the two dummy columns is redundant, so it is removed
# right away ("dummy_1" is a placeholder for that column's name).
df = pd.get_dummies(data).drop(columns=["dummy_1"])

## Label Encoding
Label encoding refers to the process of transforming the word labels into numerical form. This enables the algorithms to operate on our data.

In [None]:
# First approach: using preprocessing module
import numpy as np
from sklearn import preprocessing

# Only the `preprocessing` module is imported above, so LabelEncoder must be
# reached through it (a bare LabelEncoder() raises NameError)
lb_encoder = preprocessing.LabelEncoder()
df["COLUMN_CAT"] = lb_encoder.fit_transform(df["COLUMN"])

# Creating dummy variables for categorical columns
from sklearn.preprocessing import LabelBinarizer
lb_style = LabelBinarizer()
lb_results = lb_style.fit_transform(df["COLUMN"])
# NOTE: with exactly two classes LabelBinarizer returns a single column, so
# columns=lb_style.classes_ only lines up when there are three or more classes
pd.DataFrame(lb_results, columns=lb_style.classes_).head()

In [None]:
# Second approach: using pandas to convert the column type to category
df['COLUMN'] = df['COLUMN'].astype('category')
# The integer codes are read from the column that was just converted;
# 'COLUMN_CATEGORY' is the new column being created and does not exist yet
df['COLUMN_CATEGORY'] = df['COLUMN'].cat.codes

In [None]:
# Third approach: spell out the value -> integer code table by hand
value_to_code = {'VALUE_1': 0, 'VALUE_2': 1, 'VALUE_3': 2}
mapped_codes = df['COLUMN'].map(value_to_code)
df['COLUMN'] = mapped_codes.astype(int)

In [None]:
# Convert string data to numerical data
# `data` may be a list of lists or a np.array of strings; it is converted to an
# array first because .shape and [:, i] column slicing are not available on plain lists
samples = np.asarray(data)

# One fitted LabelEncoder per non-numeric column, in column order
label_encoder = []

# First construct the transformed data skeleton
X_encoded = np.empty(samples.shape)

# The first row decides, per column, whether the column is numeric or categorical
for i, item in enumerate(samples[0]):
    if item.isdigit():
        # str.isdigit() is only True for unsigned integers; negative or decimal
        # columns therefore fall through to label encoding
        X_encoded[:, i] = samples[:, i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        X_encoded[:, i] = label_encoder[-1].fit_transform(samples[:, i])

# imagine last column is the outcome (y)
X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)

##  Plotting

In [None]:
## Scatter plot
import matplotlib.pyplot as plt

# Scatter of the input data, drawn as white squares with a black outline
plt.figure()
square_marker_style = dict(s=75,
                           facecolors='white',
                           edgecolors='black',
                           linewidth=1,
                           marker='s')
plt.scatter(X, y, **square_marker_style)
plt.title('Input data')

## Line plot
# keyword arguments shared by both lines
common_line_style = dict(linewidth=1, alpha=0.5, grid=True)
df['COL1'].plot(kind='line',
                color='g',
                label='LABEL',
                linestyle=':',
                **common_line_style)
# a second series added to the same plot
df['COL2'].plot(color='r',
                label='Defense',
                linestyle='-.',
                **common_line_style)
plt.legend(loc='upper right')     # shows the label= text of each line
plt.xlabel('x axis')
plt.ylabel('y axis')
plt.title('Line Plot')

## Histogram
# bins = number of bars in the figure
df['COL'].plot(kind='hist', bins=10)

## Boxplot
df.boxplot(column='y_column', by='x_column')

## Ploting tricks

## Using a scatter plot to draw graph with multiple marker
plt.figure()
plt.title('Multiple markers')
marker_shapes = 'v^os'
mapper = list(marker_shapes)
for i in range(df.shape[0]):
    # .iloc selects by position; df[i, 0] would be looked up as a column label and fail.
    # The modulo cycles through the four markers so frames with more than 4 rows do not overflow.
    plt.scatter(df.iloc[i, 0], df.iloc[i, 1], marker=mapper[i % len(mapper)],
                s=75, edgecolors='black', facecolors='none')

## Logistic Regression classifier

In [None]:
# linear_model is not imported anywhere earlier in the notebook, so bring it in here
from sklearn import linear_model

# Create the logistic regression classifier
# C is the inverse regularization strength: larger C means weaker regularization
classifier = linear_model.LogisticRegression(solver='liblinear', C=1)
#classifier = linear_model.LogisticRegression(solver='liblinear', C=100)

# Train the classifier
classifier.fit(X, y)

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian naive Bayes model (fit returns the estimator itself)
nb_clf = GaussianNB().fit(X, y)
# score() reports mean accuracy on the given held-out features / outcomes
accuracy_NB = nb_clf.score(features_test, outcomes_test)
print(accuracy_NB)

## Export model which can be used later

In [None]:
import pickle 
# Destination file for the serialized model
output_model_file = 'model.pkl'

# Pickle writes bytes, hence the binary 'wb' mode
with open(output_model_file, 'wb') as model_fh:
    pickle.dump(model_obj, model_fh)

## Regression

In [None]:
from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm 

# Fit an ordinary least-squares model on the training split
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predictions for the held-out samples
y_test_pred = regressor.predict(X_test)

# Report every metric rounded to two decimals
print("Linear regressor performance:")
metric_table = (
    ("Mean absolute error =", sm.mean_absolute_error),
    ("Mean squared error =", sm.mean_squared_error),
    ("Median absolute error =", sm.median_absolute_error),
    ("Explain variance score =", sm.explained_variance_score),
    ("R2 score =", sm.r2_score),
)
for caption, metric_fn in metric_table:
    print(caption, round(metric_fn(y_test, y_test_pred), 2))

## Cross Validation and train/test split

In [None]:
## Avoiding overfitting by splitting data into train/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=1)


## Cross-validation KFold
from sklearn.model_selection import KFold

# Keep a reference to the splitter: the loop below needs it
kf = KFold(n_splits=2, random_state=None, shuffle=False)
# Split the same array that is indexed inside the loop so the indices line up
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # positional indexing -- assumes X and y are numpy arrays (use .iloc for pandas objects)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

#### Cross-validation recommendations
 - K can be any number, but K=10 is generally recommended
 - For classification problems, **stratified sampling** is recommended for creating the folds

In [None]:
# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn
from sklearn.model_selection import cross_val_score

# Model could be any model: Regression, kNN, DecisionTrees ...
# cv=10 runs 10-fold cross-validation and returns one score per fold
scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
print(scores)
# use average accuracy as an estimate of out-of-sample accuracy
print(scores.mean())

In [None]:
## Another method to get stratified cross-validated sampling is as follows:
from sklearn.model_selection import StratifiedShuffleSplit

# 10 random splits, each holding out 10% of the samples
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
# Split the same X / y that are indexed inside the loop so the indices line up
for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    # positional indexing -- assumes X and y are numpy arrays (use .iloc for pandas objects)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

## Regularized Regression
Linear regression chooses its parameters (coefficients) by minimizing a loss function. If the model decides that one feature is important, it assigns that feature a large coefficient. This can cause overfitting, which is similar to memorizing the training data in KNN. To avoid overfitting, we use regularization, which penalizes large coefficients.

$$ OLS = \text{Ordinary Least Squares} = \sum_i (y_i - \hat{y}_i)^2 $$
$$ L1: \text{Lasso regression loss function} = OLS + \alpha \times \sum \lvert parameter \rvert $$

$$ L2: \text{Ridge regression loss function} = OLS + \alpha \times \sum (parameter)^{2} $$

In [None]:
## First Approach: Ridge regression or L2
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

x_train,x_test,y_train,y_test = train_test_split(x,
                                                 y,
                                                 random_state=2,
                                                 test_size=0.3)
# The estimators no longer accept normalize=True, so the features are standardized
# explicitly. The scaler is fitted on the training split only to avoid leaking test data.
# NOTE: StandardScaler is not numerically identical to the old normalize=True scaling,
# so alpha may need re-tuning.
ridge_scaler = StandardScaler().fit(x_train)
# alpha >= 0 is the regularization strength: small alpha => overfitting, large => underfitting
ridge = Ridge(alpha = 0.1)
ridge.fit(ridge_scaler.transform(x_train), y_train)
ridge_predict = ridge.predict(ridge_scaler.transform(x_test))
print('Ridge score: ',ridge.score(ridge_scaler.transform(x_test), y_test))

## Second Approach: Lasso regression or L1
from sklearn.linear_model import Lasso

x_train,x_test,y_train,y_test = train_test_split(x,
                                                 y,
                                                 random_state=3,
                                                 test_size=0.3)
lasso_scaler = StandardScaler().fit(x_train)
lasso = Lasso(alpha = 0.1)
lasso.fit(lasso_scaler.transform(x_train), y_train)
# stored under its own name so the ridge predictions above are not overwritten
lasso_predict = lasso.predict(lasso_scaler.transform(x_test))
print('Lasso score: ',lasso.score(lasso_scaler.transform(x_test), y_test))
# L1 regularization can shrink coefficients exactly to zero
print('Lasso coefficients: ',lasso.coef_)

## Logistic Regression and ROC Curve
- Logistic regression output is probabilities.
- By default logistic regression threshold is 0.5
- In ROC curve, x axis is false positive rate and y axis is true positive rate.
- If the curve in plot is closer to left-top corner, test is more accurate.
- The ROC curve score is the AUC (area under the curve), computed from the prediction scores. We want the AUC to be close to 1.
- **fpr** = False Positive Rate
- **tpr** = True Positive Rate

In [None]:
# ROC Curve with logistic regression
from sklearn.metrics import roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Hold out 30% of the samples for evaluation
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42)

# Fit the classifier (fit returns the estimator) and keep the probability
# reported in the second predict_proba column
logreg = LogisticRegression().fit(x_train, y_train)
class_probabilities = logreg.predict_proba(x_test)
y_pred_prob = class_probabilities[:, 1]

# False / true positive rates at every candidate threshold
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Dashed diagonal for reference, then the ROC curve itself
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.show()

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier 
# Decision tree classifier
# random_state seeds the estimator's random number generator,
# max_depth caps how deep the tree may grow
params = dict(random_state=0, max_depth=4)
classifier = DecisionTreeClassifier(**params)
classifier.fit(X_train, y_train)

y_test_pred = classifier.predict(X_test)

# Report performance on both splits; the data is assumed to contain two classes
class_names = ['Class-0', 'Class-1']
banner = "#" * 40
report_plan = (
    ("\n" + banner,
     "\nClassifier performance on training dataset\n",
     y_train,
     classifier.predict(X_train)),
    (banner,
     "\nClassifier performance on test dataset\n",
     y_test,
     y_test_pred),
)
for opening, heading, y_true, y_hat in report_plan:
    print(opening)
    print(heading)
    print(classification_report(y_true, y_hat, target_names=class_names))
    print(banner + "\n")

## RandomForest / Extremely RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# n_estimators is the number of trees that will be built
params = dict(n_estimators=100, max_depth=4, random_state=0)

# Random Forest model (fit returns the fitted estimator)
classifier = RandomForestClassifier(**params).fit(X_train, y_train)

# Extremely Random Forest model, same settings
classifier = ExtraTreesClassifier(**params).fit(X_train, y_train)

# Real-world data rarely has the same number of points in every class. When one
# class heavily outnumbers the other (say 10 to 1), the classifier tends to become
# biased towards the larger class. class_weight='balanced' is added to the
# settings to counter that.
params = {**params, 'class_weight': 'balanced'}
classifier = RandomForestClassifier(**params)

## HyperParameter Tuning with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

## First example: applied on ExtraTreesClassifier
# Define the parameter grid: first vary max_depth at a fixed number of trees,
# then vary the number of trees at a fixed depth
parameter_grid = [{'n_estimators': [100], 'max_depth': [2, 4, 7, 12, 16]},
                  {'max_depth': [4], 'n_estimators': [25, 50, 100, 250]}]

metrics = ['precision_weighted', 'recall_weighted']

for metric in metrics:
    print("\n##### Searching optimal parameters for", metric)

    classifier = GridSearchCV(ExtraTreesClassifier(random_state=0),
                              parameter_grid,
                              cv=5,
                              scoring=metric)

    classifier.fit(X_train, y_train)

    print("\nGrid scores for the parameter grid:")
    # cv_results_ replaces the removed grid_scores_ attribute: parallel lists hold
    # each candidate's parameters and its mean cross-validated score.
    # The loop variable is not called `params` so the notebook-level dict is left alone.
    cv_results = classifier.cv_results_
    for candidate, avg_score in zip(cv_results['params'],
                                    cv_results['mean_test_score']):
        print(candidate, '-->', round(avg_score, 3))

    print("\nBest parameters:", classifier.best_params_)

    # predict() uses the best estimator found by the search
    y_pred = classifier.predict(X_test)
    print("\nPerformance report:\n")
    print(classification_report(y_test, y_pred))
        
        
## Second example: applied on kNN
# KNeighborsClassifier is not imported anywhere earlier in the notebook
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes k = 1 .. 49
grid = {'n_neighbors': np.arange(1,50)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn,
                      grid,
                      cv=3) # GridSearchCV
knn_cv.fit(x,y)# Fit

# Print hyperparameter
print("Tuned hyperparameter k: {}".format(knn_cv.best_params_))
print("Best score: {}".format(knn_cv.best_score_))

## Third example: applied on Logistic Regression
# 1. hyperparameter is C: logistic regression (inverse) regularization strength
# 2. penalty l1 or l2
# Hyperparameter grid
param_grid = {'C': np.logspace(-3, 3, 7), 'penalty': ['l1', 'l2']}
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size = 0.3,
                                                    random_state = 12)
# The default solver does not support the 'l1' penalty, so every l1 candidate would
# fail to fit; liblinear supports both penalties in the grid
logreg = LogisticRegression(solver='liblinear')
logreg_cv = GridSearchCV(logreg,
                         param_grid,
                         cv=3)

logreg_cv.fit(x_train,y_train)

# Print the optimal parameters and best score
print("Tuned hyperparameters : {}".format(logreg_cv.best_params_))
print("Best Accuracy: {}".format(logreg_cv.best_score_))

## Confusion matrix

In [None]:
import matplotlib.pyplot as plt 
from sklearn.metrics import confusion_matrix

# Define sample labels
true_labels = [2, 0, 0, 2, 4, 4, 1, 0, 3, 3, 3]
pred_labels = [2, 1, 0, 2, 4, 3, 1, 0, 1, 3, 3]
# Create confusion matrix (rows = true labels, columns = predicted labels)
confusion_mat = confusion_matrix(true_labels, pred_labels)
print(confusion_mat)

## Visualize confusion matrix
# By using matplotlib
plt.imshow(confusion_mat, interpolation='nearest', cmap=plt.cm.gray)
plt.title('Confusion matrix')
plt.colorbar()
# One tick per class, derived from the matrix size instead of a hard-coded 5
ticks = np.arange(confusion_mat.shape[0])
plt.xticks(ticks, ticks)
plt.yticks(ticks, ticks)
plt.ylabel('True labels')
plt.xlabel('Predicted labels')
plt.show()

# by using seaborn (annot=True writes the count into each cell, fmt="d" formats it as an integer)
import seaborn as sns
sns.heatmap(confusion_mat, annot=True, fmt="d")

## Precision / Recall / f1
Let's discuss accuracy: is it enough for model selection? Suppose a dataset contains 95% normal and 5% abnormal samples, and we evaluate our model by accuracy. A model that predicts "normal" for every sample reaches 95% accuracy, yet it classifies every abnormal sample wrongly. Therefore, for **imbalanced data** we need to use the **confusion matrix** (and the metrics derived from it) to evaluate the model.

**Precision:** the fraction of samples predicted as positive that are actually positive:
$$ Precision = \frac {TP}{(TP+FP)} $$

**Recall:** the number of items that were retrieved as a percentage of the overall number of items that were supposed to be retrieved.
$$ Recall = \frac {TP}{(TP+FN)} $$

Good classifier => High:Precision High:Recall

$$ f1 = \frac {2 \times Precision \times Recall}{(Precision + Recall)} $$

In [None]:
from sklearn.metrics import classification_report
# Text summary of per-class results, with one display name per class label 0..4
targets = ['Class-{}'.format(label) for label in range(5)]
report_text = classification_report(true_labels, pred_labels, target_names=targets)
print('\n', report_text)

## Computing relative feature importance

In [None]:
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, explained_variance_score

# AdaBoost Regressor model: 400 boosted depth-4 regression trees
regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
            n_estimators=400, random_state=7)
regressor.fit(X_train, y_train)

# Evaluate performance of AdaBoost regressor
y_pred = regressor.predict(X_test)
print("\nADABOOST REGRESSOR")
print("Mean squared error =", round(mean_squared_error(y_test, y_pred), 2))
print("Explained variance score =", round(explained_variance_score(y_test, y_pred) , 2))

# Extract feature importances
feature_importances = regressor.feature_importances_
# Some dataset loaders return feature_names as a plain list; converting to an array
# lets it be reordered with the integer index array below
feature_names = np.asarray(housing_data.feature_names)

# Sort the values and flip them (most important feature first)
index_sorted = np.flipud(np.argsort(feature_importances))

# Arrange the X ticks (the 0.5 offset centres each bar on its tick)
pos = np.arange(index_sorted.shape[0]) + 0.5

# Plot the bar graph
plt.figure()
plt.bar(pos, feature_importances[index_sorted], align='center')
plt.xticks(pos, feature_names[index_sorted])
plt.ylabel('Relative Importance')
plt.title('Feature importance using AdaBoost regressor')
plt.show()

## KMeans

In [None]:
from sklearn.cluster import KMeans

# KMeans settings:
#   init='k-means++' picks the initial centers in a smarter way
#   n_clusters is a placeholder -- put the desired number of clusters here
#   n_init is how many times the algorithm runs before keeping the best outcome
kmeans_settings = dict(init='k-means++', n_clusters='SOME_NUMBER', n_init=10)
kmeans = KMeans(**kmeans_settings).fit(df)
# Cluster index assigned to every row of df
labels = kmeans.predict(df)

#### Tuning Hyperparameter for Kmeans
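**inertia**: the sum of squared distances from each sample to the centre of its own cluster, i.e. how tightly packed the clusters are (lower means tighter).
**What is the best number of clusters?** Inertia keeps decreasing as clusters are added, so there is a trade-off between low inertia and not having too many clusters; a common choice is the "elbow" of the curve.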

In [None]:
# Candidate numbers of clusters to try
cluster_counts = range(1, 10)
# One slot per candidate, so no uninitialized entry is left behind
inertia_list = np.empty(len(cluster_counts))
for slot, n_clusters in enumerate(cluster_counts):
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(df)
    inertia_list[slot] = kmeans.inertia_
# x and y must have the same length: plot inertia against the actual cluster counts
plt.plot(cluster_counts, inertia_list, '-o')
plt.xlabel('Number of cluster')
plt.ylabel('Inertia')
plt.show()

#### Estimating the number of clusters with Mean Shift algorithm
In the Mean Shift algorithm, we consider the whole feature space as a probability density function. We start with the training dataset and assume that they have been sampled from a probability density function. In this framework, the clusters correspond to the local maxima of the underlying distribution. If there are K clusters, then there are K peaks in the underlying data distribution and Mean Shift will identify those peaks.

The goal of Mean Shift is to identify the location of centroids.

Estimate the bandwidth of the input data. Bandwidth is a parameter of the underlying kernel density estimation process used in the Mean Shift algorithm. It affects the overall convergence rate of the algorithm and the number of clusters we end up with, so it is a crucial parameter. If the bandwidth is too small, it may result in too many clusters, whereas if it is too large, distinct clusters will be merged.

In [None]:
from sklearn.cluster import MeanShift, estimate_bandwidth 

# Bandwidth estimated from X itself, using every sample
bandwidth_X = estimate_bandwidth(X, quantile=0.1, n_samples=len(X))

# Fit the Mean Shift model with that bandwidth (fit returns the estimator)
meanshift_model = MeanShift(bandwidth=bandwidth_X, bin_seeding=True).fit(X)

# Centers found by the model
cluster_centers = meanshift_model.cluster_centers_
print('\nCenters of clusters:\n', cluster_centers)

# The number of distinct labels is the number of clusters
labels = meanshift_model.labels_
num_clusters = np.unique(labels).size
print("\nNumber of clusters in input data =", num_clusters)

#### Estimating the quality of clustering with silhouette scores
$$ \text{silhouette score} = \frac{p - q}{\max(p, q)} $$

Here, p is the mean distance to the points in the nearest cluster that the data point is not a part of, and q is the mean intra-cluster distance to all the points in its own cluster.

The value of the silhouette score range lies between -1 to 1. A score closer to 1 indicates that the data point is very similar to other data points in the cluster, whereas a score closer to -1 indicates that the data point is not similar to the data points in its cluster. 

In [None]:
from sklearn import metrics 
from sklearn.cluster import KMeans 

# Candidate cluster counts 2..9 and the score collected for each
values = np.arange(2, 10)
scores = []

for num_clusters in values:
    # Fit KMeans for this candidate (fit returns the estimator)
    kmeans = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10).fit(X)

    # Silhouette score over all samples with Euclidean distance
    score = metrics.silhouette_score(
        X, kmeans.labels_, metric='euclidean', sample_size=len(X))

    print("\nNumber of clusters =", num_clusters)
    print("Silhouette score =", score)
    scores.append(score)

# The position of the best score, shifted by the first candidate, is the best count
num_clusters = values[0] + np.argmax(scores)
print('\nOptimal number of clusters =', num_clusters)

## Training Pipeline

In [None]:
# make_classification is imported from sklearn.datasets directly; the
# samples_generator submodule no longer exists in current scikit-learn
from sklearn.datasets import make_classification
# f_regression stays imported in case later cells use it
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier

# Generate data
X, y = make_classification(n_samples=150,
        n_features=25, n_classes=3, n_informative=6,
        n_redundant=0, random_state=7)

# Select top K features
# f_classif is the ANOVA F-test for a categorical target; f_regression assumes a
# continuous target and is not meant for the class labels generated above
k_best_selector = SelectKBest(f_classif, k=9)

# Initialize Extremely Random Forests classifier
classifier = ExtraTreesClassifier(n_estimators=60, max_depth=4)

# Construct the pipeline
processor_pipeline = Pipeline([('selector', k_best_selector), ('erf', classifier)])

# Set the parameters: <step name>__<parameter> overrides the values given above
processor_pipeline.set_params(selector__k=7, erf__n_estimators=30)

# Training the pipeline
processor_pipeline.fit(X, y)

# Predict outputs for the input data
output = processor_pipeline.predict(X)
print("\nPredicted output:\n", output)

# Print scores (computed on the training data itself, so this is an optimistic estimate)
print("\nScore:", processor_pipeline.score(X, y))

## Python pass function parameters as a dictionary

In [None]:
# Collect all desired keyword arguments in one dict
params = dict(n_estimators=100, max_depth=4, random_state=0)
# Prefixing the dict with ** unpacks it into keyword arguments of the call
classifier = RandomForestClassifier(**params)

## Python argument parser

In [None]:
import argparse

# Argument parser  
def build_arg_parser():
    """Build the command-line parser for the classifier script.

    The parser has a single required option, --classifier-type, stored as
    `classifier_type` and restricted to 'rf' or 'erf'.
    """
    cli = argparse.ArgumentParser(
        description='Classify data using Ensemble Learning techniques')
    option_spec = dict(
        dest='classifier_type',
        required=True,
        choices=['rf', 'erf'],
        help="Type of classifier to use; can be either 'rf' or 'erf'",
    )
    cli.add_argument('--classifier-type', **option_spec)
    return cli
    
if __name__ == '__main__':
    # Read the requested classifier type from the command line
    args = build_arg_parser().parse_args()
    classifier_type = args.classifier_type

    # 'rf' selects the random forest; anything else falls through to extra trees
    estimator_cls = RandomForestClassifier if classifier_type == 'rf' else ExtraTreesClassifier
    classifier = estimator_cls(**params)

## Then call the script like this:
$ python3 SCRIPT_NAME.py --classifier-type rf

In [None]:
# Second approach is by using sys package
import sys

# within the script, check the arguments passed on the command line and act on them
# (sys.argv[0] is the script name, so the first real argument is sys.argv[1])
if len(sys.argv) > 1:
    if sys.argv[1] == 'SOME_VALUE':
        pass  ## DO SOMETHING (a branch needs at least one statement; a comment alone is a syntax error)
    else:
        pass  ## DO SOMETHING ELSE

## Then call the script like this:
$ python3 SCRIPT_NAME.py SOME_VALUE

## Reading files manually

In [None]:
# Each element of `data` is the list of comma-separated fields of one line
data = []
with open(input_file, 'r') as f:
    # Iterate the file lazily instead of loading every line with readlines()
    for line in f:
        # Strip only the newline: line[:-1] would cut a real character off a
        # final line that has no trailing newline
        items = line.rstrip('\n').split(',')
        data.append(items)

## Pandas setting for avoiding truncating

In [None]:
# Show up to 4000 columns before pandas starts eliding them
pd.set_option('display.max_columns', 4000)
# Allow 1000 characters per output line before wrapping
pd.set_option('display.width', 1000)
# None means "never truncate cell contents"; the old -1 spelling is rejected by current pandas
pd.set_option('display.max_colwidth', None)

## Print output spacing

In [7]:
# Three headers on one line; {:^61} centres the middle one in a 61-character field
row_template = '{} {:^61} {}'
headers = ('column1', 'column2_with_space', 'column3')
print(row_template.format(*headers))

column1                      column2_with_space                       column3


## Runtime timing

In [1]:
import timeit
# Shared by the two NumPy measurements: import numpy and build the array once
numpy_setup = "import numpy as np; na=np.arange(1000)"
repetitions = 10000

# Sum of squares three ways: pure Python, Python sum() over an array, NumPy dot product
normal_py_sec = timeit.timeit('sum(x*x for x in range(1000))', number=repetitions)
naive_np_sec = timeit.timeit('sum(na*na)', setup=numpy_setup, number=repetitions)
good_np_sec = timeit.timeit('na.dot(na)', setup=numpy_setup, number=repetitions)

timing_report = (
    ("Normal Python: %f sec", normal_py_sec),
    ("Naive NumPy: %f sec", naive_np_sec),
    ("Good NumPy: %f sec", good_np_sec),
)
for template, elapsed in timing_report:
    print(template % elapsed)

Normal Python: 0.819417 sec
Naive NumPy: 0.795714 sec
Good NumPy: 0.012523 sec


In [None]:
## Second approach: using jupyter notebook magic functions
# means loop 100 times the following statements in the current cell
%%timeit -n 100
# TO SOMETHING