In [20]:
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 24 12:21:30 2017

@author: Luca
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import ensemble
from sklearn import tree
from sklearn.metrics import roc_curve,f1_score,auc
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.linear_model as lm
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import pickle


def dataCleaning(df, features_to_delete):
    for feature in features_to_delete:
        df.drop(feature, axis=1, inplace=True)
        
    df = df.replace(np.nan, "")
    df = df.replace("NaN", "")
    df['was_liked'] = np.where(df['was_liked'] == "FALSE",0,1)
    return df


def checkTicketsDistribution(df):
    num_true = len(df.loc[df["was_liked"] == 1])
    num_false = len(df.loc[df["was_liked"] == 0])
    print("Number of True cases: {0} ({1:2.2f}%)".format(num_true, (num_true/(num_true+num_false))*100))
    print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, (num_false/(num_true+num_false))*100))

    
def splitData(df, X, y):
    split_test_size = 0.20
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = split_test_size, random_state = 42)

    return X_train, y_train, X_test, y_test


def checkColumnsTypes(df):
    print("Data types and their frequency: \n{}".format(df.dtypes.value_counts()))
    print("")
    object_columns_df = df.select_dtypes(include=['object'])
    print(object_columns_df.iloc[0])    


In [22]:
file = "DATASET10000.csv"
features_to_delete = ["index","URL", "Date-GMT","Comments","Likes",
                        "Location_ID", "Location_Name",
                        "lat", "lng", "Original_Photo",
                        "Standard_Resolution_Photo_640", "Low_Resolution_Photo_320", "Thumbnail_Photo_150",
                        "username","Full_Name","User_ID","Profile_Picture"]

# FROM CSV TO DATAFRAME
df = pd.read_csv(file, low_memory=False, encoding="ISO-8859-1", dtype='unicode')

# DATAFRAME SIZE
print("Originial shape is: \n", df.shape)
print("")

# DATAFRAME CLEANING
df = dataCleaning(df,features_to_delete)
print("")
print("Shape after cleaning: \n", df.shape)
print("")

# DATAFRAME DISTRIBUTION CHECK
checkTicketsDistribution(df)

# RELEVANT FEATURES
#features = ["created_time", "caption_text", "hashtags", "users_in_photo","items", "num_of_followers"]
feature_to_predict = ["was_liked"]
features = ["Date","Caption","Hashtags","num_of_followers","items"]

# CONVERSION TO LISTS
X = df[features].values
y = df[feature_to_predict].values

# COUNTVECTORIZER
vect = TfidfVectorizer()

#TRAINING SET
print("-------------------FITTING AND TRANSFORMING TRAINING SET-------------------")
#to_fit_transform = ["caption_text","hashtags","users_in_photo","items"]
to_fit_transform = ["Caption","Hashtags","items"]

total_data = df
total_data = total_data.drop(['was_liked'],axis=1) # Dropped label column 

for feature in to_fit_transform:
    total_data[feature] = list((vect.fit_transform(total_data[feature])))
    total_data[feature] = total_data[feature].astype(str)  # Converting every compressed metric data to hashable string format

total_data = total_data.replace(np.nan,0)
print(total_data.shape)
df_encoded = pd.get_dummies(total_data, columns=to_fit_transform, drop_first=True)
print(df_encoded.shape)
#     e = open('columnames.pickle', 'wb')
#     pickle.dump(list(df_encoded.columns), e)
#     e.close()
X_with_numerical_cols = df_encoded.values  # X with all cols converted into numeric format
X_train, y_train, X_test, y_test = splitData(df, X_with_numerical_cols, y)  # splitting data with all new numeric cols
print("{0:0.2f}% in training set".format((len(X_train)/len(df.index))*100))
print("{0:0.2f}% in test set".format((len(X_test)/len(df.index))*100))
print("")

YTrain = pd.DataFrame(y_train,columns=feature_to_predict)
YTest = pd.DataFrame(y_test,columns=feature_to_predict)

Originial shape is: 
 (9998, 23)


Shape after cleaning: 
 (9998, 6)

Number of True cases: 5214 (52.15%)
Number of False cases: 4784 (47.85%)
-------------------FITTING AND TRANSFORMING TRAINING SET-------------------
(9998, 5)
(9998, 21220)
80.00% in training set
20.00% in test set



In [23]:
#DATAFRAME DISTRIBUTION CHECK AFTER SPLITTING
print("Training Set:")
checkTicketsDistribution(YTrain)
print("")
print("Testing Set:")
checkTicketsDistribution(YTest)
print("")

print("-------------------FINISHED FITTING AND TRANSFORMING TRAINING SET-------------------")

#CHECK SETS SHAPE
print("-------------------TRAINING SET-------------------")
print(X_train.shape)
print(y_train.shape)
print("-------------------TESTING SET-------------------")
print(X_test.shape)
print(y_test.shape)

Training Set:
Number of True cases: 4161 (52.03%)
Number of False cases: 3837 (47.97%)

Testing Set:
Number of True cases: 1053 (52.65%)
Number of False cases: 947 (47.35%)

-------------------FINISHED FITTING AND TRANSFORMING TRAINING SET-------------------
-------------------TRAINING SET-------------------
(7998, 21220)
(7998, 1)
-------------------TESTING SET-------------------
(2000, 21220)
(2000, 1)


In [24]:
type(X_train)

numpy.ndarray

## Get Parameters of knn from here and put it in knn and then check accuracy of knn

In [None]:
# Grid Search for Algorithm Tuning
import numpy as np
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
# load the diabetes datasets

# prepare a range of alpha values to test
tuned_parameters = [dict(n_neighbors = [4,5,6,7],
                         weights = ["uniform", "distance"],
                        algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute'],
                        p = [1, 2])
                   ]

# create and fit a ridge regression model, testing each alpha
model = KNeighborsClassifier()
grid = GridSearchCV(model, tuned_parameters)
grid.fit(X_train, y_train.ravel())
# print(grid)
print(grid.best_params_)
# summarize the results of the grid search
# print(grid.best_score_)
# print(grid.best_estimator_)

In [25]:
#MODEL TRAINING
print("-----------------------------------------------\n READY FOR TRAINING \n-----------------------------------------------")
#LOGISTIC REGRESSION CLASSIFIER
#lr = lm.LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
#          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
#          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
 #         verbose=0, warm_start=False)

#NAIVE BAYES CLASSIFIER
#lr = GaussianNB()

#KNN CLASSIFIER
#lr = KNeighborsClassifier(n_neighbors=7)

#RANDOM FOREST CLASSIFIER
#lr=ensemble.RandomForestClassifier(n_estimators=50)

#DECISION TREE CLASSIFIER
#lr = tree.DecisionTreeClassifier(max_depth=80)

#GRADIENT BOOSTING CLASSIFIER
lr = ensemble.GradientBoostingClassifier(n_estimators=50)


print("-----------------------------------------------\n FITITNG TRAINING SET FOR THE MODEL \n-----------------------------------------------")
lr.fit(X_train, y_train.ravel())

print("-----------------------------------------------\n PREDICTING... \n-----------------------------------------------")
#y_pred_class = lr.predict_proba(X_test)
y_pred_class_bin = lr.predict(X_test)
print(X_test)
#print(y_pred_class)
print(y_pred_class_bin)
print(y_test.ravel())

#TESTING TOOLS
print("Accuracy: ")
print(metrics.accuracy_score(y_test, y_pred_class_bin))
print("")

print("Confusion Matrix: \n")
print(metrics.confusion_matrix(y_test,y_pred_class_bin))
print("")

#false_positive_rate, true_positive_rate, _ = roc_curve(y_test,y_pred_class[:,1],pos_label="TRUE")
false_positive_rate, true_positive_rate, _ = roc_curve(y_test,y_pred_class_bin)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

print("AUC", roc_auc)

F1 = f1_score(y_test, y_pred_class_bin, average='binary')
print("F1 Score: ", F1)
    
# #DUMPING TO FILES
# dill.dump(lr, open("model.p", "wb"))
# dill.dump(vect, open("countvectorizer.p", "wb"))

-----------------------------------------------
 READY FOR TRAINING 
-----------------------------------------------
-----------------------------------------------
 FITITNG TRAINING SET FOR THE MODEL 
-----------------------------------------------
-----------------------------------------------
 PREDICTING... 
-----------------------------------------------
[['1512441198' '26402652' 0 ..., 0 0 0]
 ['1512110104' '22111608' 0 ..., 0 0 0]
 ['1513965946' '83161316' 0 ..., 0 0 0]
 ..., 
 ['1511833588' '26402652' 0 ..., 0 0 0]
 ['1514133631' '86354080' 0 ..., 0 0 0]
 ['1508943992' '24198445' 0 ..., 0 0 0]]
[0 0 1 ..., 0 1 1]
[0 0 1 ..., 0 1 1]
Accuracy: 
0.721

Confusion Matrix: 

[[603 344]
 [214 839]]

AUC 0.71675937709
F1 Score:  0.750447227191


Accuracy: 
0.6055

Confusion Matrix: 

[[703 244]
 [545 508]]

AUC 0.612387697041
F1 Score:  0.562880886427