In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os 
from datetime import datetime
import math


In [None]:
# Remember the directory the notebook was started from before switching away.
basedirectory = os.getcwd()
# Directory that holds the Kickstarter CSV. Set the KICKSTARTER_DATA_DIR
# environment variable to run the notebook on another machine; the default is
# the original author's local folder.
data_directory = os.environ.get(
    "KICKSTARTER_DATA_DIR",
    r"C:\Users\ivann\OneDrive\Documents\~General Assembly Documents",
)
os.chdir(data_directory)


In [None]:
raw_data = pd.read_csv("DSI_kickstarterscrape_dataset.csv", encoding ="latin-1")

In [None]:
raw_data.head()
#two things you should always do: dataset.info() and dataset.describe()

In [None]:
# info() prints its report itself and returns None, so it is called directly
# rather than wrapped in display() (which would show a stray "None").
raw_data.info()
# Summary statistics of the numeric columns.
display(raw_data.describe())

In [None]:
#Let's update the data types on our raw data
# Work on an independent copy so the in-place edits made to data_updated_dtype
# in later cells do not silently change raw_data as well.
data_updated_dtype = raw_data.copy()

In [None]:
# The project id is an identifier, not a quantity, so store it as text.
data_updated_dtype['project id'] = data_updated_dtype['project id'].astype('string')

# Split the raw 'funded date' text at the first comma only: the part before the
# comma goes to 'Day of Week', the remainder is the timestamp text.
day_and_timestamp = data_updated_dtype['funded date'].str.split(',', n=1, expand=True)
data_updated_dtype['Day of Week'] = day_and_timestamp[0]

# Remove the trailing "-0000" as a literal suffix. str.rstrip('-0000') would
# strip any run of '-' and '0' characters instead, which only works while a
# space separates the time from the offset.
data_updated_dtype['funded date'] = (
    day_and_timestamp[1].str.strip().str.replace(r'\s*-0000$', '', regex=True)
)

# Parse the timestamp once and derive both the date and the time from it.
funded_timestamp = pd.to_datetime(data_updated_dtype['funded date'])
data_updated_dtype['Funded Date'] = funded_timestamp.dt.date
data_updated_dtype['Funded Time'] = funded_timestamp.dt.time
data_updated_dtype.head()

In [None]:
data_updated_dtype.describe()

In [None]:
#Now let's drop the original date time column
data_updated_dtype.drop(columns = ['funded date'], inplace = True)
data_updated_dtype.head()

In [None]:
data_updated_dtype.head()

In [None]:
#Let's check to see what type of data we have
data_updated_dtype.info()

In [None]:
data_updated_dtype.isna().any() #let's check for missing values in our updated dtype data set
#Expected output: 
# project id           False
# name                 False
# url                  False
# category             False
# subcategory          False
# location              True
# status               False
# goal                 False
# pledged               True
# funded percentage    False
# backers              False
# levels               False
# reward levels         True
# updates              False
# comments             False
# duration             False
# Day of Week          False
# Funded Date          False
# Funded Time          False
# dtype: bool

In [None]:
# Copy 'reward levels' into a column whose name has no space ('reward_levels'),
# so it can be reached with attribute access (e.g. df.isnull().reward_levels).
# The original spaced column is removed in the next cell.
data_updated_dtype["reward_levels"]=data_updated_dtype['reward levels']


In [None]:
# The underscore-named copy exists now, so delete the spaced original in place.
del data_updated_dtype['reward levels']
data_updated_dtype.head()

In [None]:
# Count the missing values in each of the three columns that contain nulls.
pledged_null = data_updated_dtype['pledged'].isna().sum()
location_null = data_updated_dtype['location'].isna().sum()
reward_null = data_updated_dtype['reward_levels'].isna().sum()
print("null location count", location_null)
print("pledged null", pledged_null)
print("reward levels null", reward_null)

In [None]:
# A missing location cannot be reconstructed from the other columns, so the
# affected rows are removed.
data_updated_dtype.dropna(subset=['location'], inplace=True)

# Confirm that no missing locations remain.
location_null = data_updated_dtype['location'].isna().sum()
print("null location count", location_null)

In [None]:
# Missing reward levels cannot be derived from the other columns either, so
# those rows are removed as well.
data_updated_dtype.dropna(subset=['reward_levels'], inplace=True)

# Confirm that no missing reward levels remain.
reward_null = data_updated_dtype['reward_levels'].isna().sum()
print("reward levels null", reward_null)

In [None]:
# Goal: fill the missing pledged amounts from goal * funded percentage.
# First check the formula: compute "Calculated_Pledged" for every row and flag
# whether it equals the reported pledged amount.
data_updated_dtype['Calculated_Pledged'] = (
    data_updated_dtype['goal'] * data_updated_dtype['funded percentage']
).round(0)
# 'pledgedMatch?' holds the strings 'True' / 'False' (not booleans).
data_updated_dtype['pledgedMatch?'] = np.where(
    data_updated_dtype['pledged'].eq(data_updated_dtype['Calculated_Pledged']),
    'True',
    'False',
)

data_updated_dtype.sample(100)


In [None]:
# Collect the rows whose pledged amount is missing so they can be inspected
# before being filled in.
nullpledged_df = data_updated_dtype.loc[data_updated_dtype['pledged'].isna()]
nullpledged_df.head(20)  # there should only be 12 rows in the pledged null data frame



In [None]:
#Let's attempt to fill in the values for this temporary data frame
#the following code snippet was me testing how to use the np.where function, which essentialy acts like an if-then function.
#The function worked the way that I hoped it would
# nullpledged_df['pledged'] = np.where(nullpledged_df['pledged'].isnull(),round(nullpledged_df['goal']*(nullpledged_df['funded percentage']),0),nullpledged_df['pledged'])
# nullpledged_df.head(12)


In [None]:
# Fill each missing pledged amount with goal * funded percentage (rounded to a
# whole number); rows that already have a pledged amount are left untouched.
data_updated_dtype['pledged'] = data_updated_dtype['pledged'].fillna(
    (data_updated_dtype['goal'] * data_updated_dtype['funded percentage']).round(0)
)
pledged_null = data_updated_dtype['pledged'].isna().sum()
# the expected output is 0.
print("pledged null", pledged_null)

In [None]:
# Build the frame used for the remaining cleaning steps: data_updated_dtype
# without the two helper columns ('Calculated_Pledged', 'pledgedMatch?').
# drop() without inplace returns a new frame, so this cell can be re-run and
# later edits to no_nulls_data do not write back into data_updated_dtype.
no_nulls_data = data_updated_dtype.drop(columns=['Calculated_Pledged', 'pledgedMatch?'])
no_nulls_data.head()



In [None]:
#Next Step: Let's make sure all categories are unique and consistent
print(no_nulls_data.category.unique())
print(no_nulls_data.groupby(['category'])['category'].count())
#From our output, we see that Film & Video is written two ways, Film & Video + Film &amp; Video
#The total expected amount of listings that fall in the Film & video category is 13,082
#Expected output
# category
    # Art                  3872
    # Comics               1034
    # Dance                 744
    # Design               1738
    # Fashion              1117
    # Film & Video          482
    # Film &amp; Video    12600
    # Food                 1411
    # Games                1689
    # Music               10671
    # Photography          1424
    # Publishing           4585
    # Technology            774
    # Theater              2451

In [None]:
# Replace every 'Film &amp; Video' category (HTML-escaped ampersand) with
# 'Film & Video' so both spellings fall into one group.
no_nulls_data['category'] = no_nulls_data['category'].replace('Film &amp; Video', 'Film & Video')
# Print the per-category row counts; .count() is required, otherwise only the
# GroupBy object's repr is printed.
print(no_nulls_data.groupby(['category'])['category'].count())


In [None]:
#let's identify the unique values of our subcategories
# print(no_nulls_data.subcategory.unique())
print(no_nulls_data.groupby(['subcategory'])['subcategory'].count())
# print(no_nulls_data.loc[no_nulls_data['subcategory'] == 'Board & Card Games'].count())


In [None]:
# Some subcategories contain an HTML-escaped ampersand ('Board &amp; Card Games',
# 'Country &amp; Folk', 'Film &amp; Video'). Decode '&amp;' to '&' in every
# subcategory value in one literal (non-regex) replacement instead of handling
# each name separately.
no_nulls_data['subcategory'] = no_nulls_data['subcategory'].str.replace('&amp;', '&', regex=False)

# Renumber the rows 0..n-1 so the index has no gaps left by dropped rows.
no_nulls_data.reset_index(drop=True, inplace=True)
no_nulls_data.head()




In [None]:
#Let's remove extra columns if possible
# print(no_nulls_data['pledgedMatch?'].is())
# print(no_nulls_data['goal'].count())

In [None]:
# Cleaning is finished: build final_data from no_nulls_data.
# 'funded percentage' is stored as a fraction; multiply by 100 so it reads as a
# percentage. assign() returns a new frame, so re-running this cell does not
# multiply by 100 a second time and no_nulls_data is left unchanged.
final_data = no_nulls_data.assign(
    **{'funded percentage': no_nulls_data['funded percentage'] * 100}
)

In [None]:
#Let's describe our data set 
round(final_data.describe(),2)
#             goal   	pledged  	funded percentage	backers 	levels  	updates 	comments	duration
# count	4.459200e+04	4.459200e+04	44592.000	44592.000  	44592.000	44592.000	44592.000	44592.000
# mean	1.210894e+04	5.104144e+03	165.889 	71.038  	8.047	4.069	8.526	39.628
# std	1.916062e+05	5.769624e+04	7634.553	698.999  	4.247	6.404	176.651	17.087
# min	5.000000e-01	0.000000e+00	0.000   	0.000   	1.000	0.000	0.000	1.000
# 25%	1.800000e+03	2.000000e+02	4.500   	5.000   	5.000	0.000	0.000	30.000
# 50%	4.000000e+03	1.326000e+03	100.000 	23.000  	8.000	2.000	0.000	31.605
# 75%	1.000000e+04	4.205000e+03	111.667 	60.000  	10.000	6.000	3.000	47.020
# max	2.147484e+07	1.026684e+07	1506600.000	87142.000	80.000	149.000	19311.000	91.960

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score

In [None]:
# Show which status values occur. This must be the last expression of the cell
# to be displayed; the full frame is deliberately not dumped here.
final_data['status'].unique()
#let's remove the rows that have live, cancelled, or suspended

# get names of indexes for which 



In [None]:
# Row labels of projects whose status is live, canceled or suspended, i.e.
# everything that is neither successful nor failed.
index_names = final_data.index[final_data['status'].isin(['live', 'canceled', 'suspended'])]

# Remove those rows in place and renumber the remaining rows from 0.
final_data.drop(index_names, inplace = True)
final_data.reset_index(inplace=True, drop=True)

final_data

In [None]:
#make sure the value we are trying to calculate is binary
sns.countplot(x = 'status', data = final_data)
#check, each item is either successful or failed


In [None]:
# Build the modelling frame by dropping columns that are not used as features:
# location, reward_levels, url, project id, name, Day of Week, Funded Date and
# Funded Time.
# The project name is removed because there is currently no way to quantify how
# good or bad a name reads to the audience, so its impact cannot be estimated.
logreg_data = final_data.drop(columns=['location','reward_levels','url','project id','name','Day of Week','Funded Date','Funded Time'])
logreg_data

In [None]:
#I am also removing funding percentage and pledged ammounts because they highly correlate with whether or not something was successful
logreg_data = logreg_data.drop(columns=['funded percentage', 'pledged'])
logreg_data

In [None]:
dummy_data = pd.get_dummies(logreg_data,columns=['category','subcategory','status'],drop_first = True)

In [None]:
# pd.set_option("max_columns", 70) #Showing only two columns
dummy_data

In [None]:
sns.heatmap(dummy_data.corr())
#Should I look into mutual information score or PCA?


In [None]:
#Let's look at the numbers associated with correlation, the closer to 1 a pair is, the more statistically correlated they are.
correlation_matrix=dummy_data.corr()
correlation_matrix

In [None]:
correlation_matrix.loc[correlation_matrix['category_Design'] > 0.2,['category_Design']]
#We see that the design subcategories are highly correlated with the Design category. This makes sense: a project's category limits which subcategories it can fall into.
#Therefore, I'm going to drop the category attribute while running my logistic regression



In [None]:
#pull out the values that we need to remove from the dummy data
category_list= correlation_matrix.filter(regex = '^category').iloc[0,:]

In [None]:
# category_list is indexed by the names of the one-hot category columns; turn
# those labels into a plain Python list for use with DataFrame.drop.
category_columns_to_drop = category_list.index.tolist()
category_columns_to_drop

In [None]:
# Remove the one-hot category columns listed in category_columns_to_drop, so
# only the subcategory dummies (plus the numeric features) remain.
# NOTE: this rebinds dummy_data; running the cell a second time raises KeyError
# because the columns are already gone.
dummy_data= dummy_data.drop(columns= category_columns_to_drop)

In [None]:
#dummy_data.columns
#Double check to make sure category has been removed sucessfully, given our output we are given what we expect


In [None]:
X_train, X_test, y_train, y_test = train_test_split(dummy_data.drop('status_successful', axis=1),
                                                   dummy_data['status_successful'], test_size=0.2,
                                                   random_state=200)

In [None]:
print(X_train.shape)
print(X_train)


In [None]:
print(y_train.shape)
print(y_train)

In [None]:
## Apply feature scaling to the model inputs
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply the same fitted
# transformation to the test split.
sc = StandardScaler()
# NOTE: X_train / X_test are rebound to their scaled versions, so re-running
# this cell without re-running the train/test split scales them a second time.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Create a logistic-regression classifier that uses the 'liblinear' solver and
# fit it on the scaled training features and the training labels.
LogReg = LogisticRegression(solver='liblinear')
LogReg.fit(X_train, y_train)

In [None]:
# Predict a class label for every row of the (scaled) test features.
y_pred = LogReg.predict(X_test)

In [None]:
## Model Evaluation
### Classification report without cross-validation
print(classification_report(y_test, y_pred))

In [None]:
### K-fold cross-validation & confusion matrices
# cross_val_predict returns, for each training row, a prediction made with
# cv=5 folds; the confusion matrix compares those predictions with y_train.
# NOTE(review): X_train was already scaled with a scaler fitted on the whole
# training split, so the folds share scaling statistics — confirm this is
# acceptable for the reported scores.
y_train_pred = cross_val_predict(LogReg, X_train, y_train, cv=5)
confusion_matrix(y_train, y_train_pred)

In [None]:
precision_score(y_train, y_train_pred)

In [None]:
## Model Evaluation
### Classification report with cross-validation
print(classification_report(y_train, y_train_pred))

In [None]:
# Second column of predict_proba for every test row.
y_pred_proba = LogReg.predict_proba(X_test)[:, 1]
# roc_curve also returns the thresholds, which are not needed for the plot.
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f"data 1, auc={auc}")
plt.legend(loc='lower right')
plt.show()

In [None]:
print('Coefficient Matrix')
print(LogReg.coef_)
# coefficients = pd.Dataframe
print('')
print('Training Data column names')
print(dummy_data.columns)


In [None]:
# LogReg.coef_ carries no column names, which makes it hard to tell which
# weight belongs to which variable. The coefficients follow the order of the
# feature columns the model was trained on, so the names are taken from
# dummy_data. 'status_successful' is excluded because it is the target being
# predicted and therefore has no weight.
column_names = dummy_data.drop(columns='status_successful').columns

print(column_names)

# Shorten the "subcategory" prefix to "sc" in every column name.
new_column_names = [column.replace("subcategory", 'sc') for column in column_names]

In [None]:
##Creation of our coefficient dataframe
coeff_df = pd.DataFrame(LogReg.coef_, columns = new_column_names)
coeff_df.head()

In [None]:
#Lets reshape the matrix so that it's one column and multiple rows.
coeff_df_transpose= coeff_df.transpose().rename(columns={0:"Weights with Backers"}, errors="raise")
print(coeff_df_transpose)
#next step, let's sort the rows by their value

In [None]:
coeff_df_transpose.sort_values('Weights with Backers')
#backers has the highest positive weight of 38,
#Comments had the positive wieght of 4,
#updates had a positive weight of 1.207
#Goal amount had a negative weight of -32.5, followed by duration at -0.2863


In [None]:
#THIS LINE OF CODE IS NO LONGER APPLICABLE SINCE WE REMOVED CATEGORIES FROM OUR MODEL
#Pull out category weights 
# category_weights = coeff_df_transpose.filter(regex = "^category", axis=0)
# category_weights.sort_values('Weights with Backers', ascending = False)
# #Having the category of either Music, theater, or dance had a net positive effect,
# #while everything else had a negative effect on the ability to succeed.


In [None]:
def Backers_TestCaseGen(goal, backers, levels, updates, comments, duration, desired_subcategory, scaler=None):
    """Build one scaled feature row for the model that includes `backers`.

    The row has one slot per entry of `parameter_list`: six numeric features
    followed by the one-hot subcategory columns.

    Parameters
    ----------
    goal, backers, levels, updates, comments, duration : number
        Values written to the first six slots, in that order.
    desired_subcategory : str
        Name of the one-hot column to set to 1 (e.g. 'sc_Theater').
        'sc_Animation' has no column of its own in `parameter_list`; it is
        encoded as all one-hot columns being 0, the same way
        NoBackers_TestCaseGen handles it.
    scaler : object with a ``transform`` method, optional
        Scaler applied to the row. Defaults to the notebook-global ``sc``.
        Pass the scaler explicitly when ``sc`` has been rebound to a scaler
        fitted on a different feature set.

    Returns
    -------
    The result of ``scaler.transform`` on the row reshaped to (1, n_features).

    Raises
    ------
    ValueError
        If `desired_subcategory` is neither 'sc_Animation' nor an entry of
        `parameter_list`.
    """
    parameter_list = [
        'goal', 'backers', 'levels', 'updates', 'comments', 'duration',
        'sc_Art', 'sc_Art Book', 'sc_Board & Card Games', 'sc_Childrens Book',
        'sc_Classical Music', 'sc_Comics', 'sc_Conceptual Art', 'sc_Country & Folk',
        'sc_Crafts', 'sc_Dance', 'sc_Design', 'sc_Digital Art', 'sc_Documentary',
        'sc_Electronic Music', 'sc_Fashion', 'sc_Fiction', 'sc_Film & Video',
        'sc_Food', 'sc_Games', 'sc_Graphic Design', 'sc_Hip-Hop', 'sc_Illustration',
        'sc_Indie Rock', 'sc_Jazz', 'sc_Journalism', 'sc_Mixed Media', 'sc_Music',
        'sc_Narrative Film', 'sc_Nonfiction', 'sc_Open Hardware', 'sc_Open Software',
        'sc_Painting', 'sc_Performance Art', 'sc_Periodical', 'sc_Photography',
        'sc_Poetry', 'sc_Pop', 'sc_Product Design', 'sc_Public Art', 'sc_Publishing',
        'sc_Rock', 'sc_Sculpture', 'sc_Short Film', 'sc_Technology', 'sc_Theater',
        'sc_Video Games', 'sc_Webseries', 'sc_World Music',
    ]
    generated_test_case = np.zeros(len(parameter_list))
    if desired_subcategory != 'sc_Animation':
        # list.index raises ValueError for an unknown name.
        generated_test_case[parameter_list.index(desired_subcategory)] = 1
    # Numeric features are written after the one-hot flag, so they always win
    # if a numeric column name is passed as the subcategory by mistake.
    numeric_values = (goal, backers, levels, updates, comments, duration)
    generated_test_case[:len(numeric_values)] = numeric_values
    active_scaler = sc if scaler is None else scaler
    return active_scaler.transform(generated_test_case.reshape(1, -1))

In [None]:
# #Pull out subcategory weights
# subcategory_weights = coeff_df_transpose.filter(regex = "^subcategory", axis=0)
# subcategory_weights.sort_values('Weights with Backers', ascending = False)
# #Short film had the most postiive effect

In [None]:
# Plot how the predicted probability (second column of LogReg.predict_proba)
# changes with the fundraising goal for a 'sc_Theater' project.

gta = 20000  # gta = goal test amount; goals 0 .. gta - 1 are evaluated


def _backers_success_curve(goal_values, backers, levels, updates, comments, duration, subcategory):
    """Return the second predict_proba column for every goal in `goal_values`.

    All other inputs are held fixed. The scaled rows are stacked and scored in
    a single predict_proba call, and a fresh array is returned for every curve
    so two plotted curves never share a buffer.
    """
    test_cases = np.vstack([
        Backers_TestCaseGen(goal, backers, levels, updates, comments, duration, subcategory)
        for goal in goal_values
    ])
    return LogReg.predict_proba(test_cases)[:, 1]


x_values = list(range(gta))
fig, (sub1, sub2) = plt.subplots(1, 2, figsize=(10, 10))

# Left: every other numeric input set to 0.
sub1.plot(x_values, _backers_success_curve(x_values, 0, 0, 0, 0, 0, 'sc_Theater'))
sub1.set_title('backers=0, levels=0, updates=0, comments=0')

# as we can see, we see an extreme drop off once we get a threshold value

# Right: backers, levels, updates and comments set to 5 (duration stays 0).
sub2.plot(x_values, _backers_success_curve(x_values, 5, 5, 5, 5, 0, 'sc_Theater'))
sub2.set_title('backers=5, levels=5, updates=5, comments=5')

for axis in (sub1, sub2):
    axis.set_xlabel('Goal amount')
    axis.set_ylabel('Predicted probability')

### Lets see what we get when you pull-out category and the number of backers from our model ###

In [None]:
# pd.get_dummies creates binary indicator columns for categorical variables.
# Here only 'subcategory' and 'status' are encoded; drop_first=True omits the
# first level of each, which becomes the implicit baseline.

dummy_data = pd.get_dummies(logreg_data,columns=['subcategory','status'],drop_first = True)
# Remove 'backers' because of how strongly it influenced the first model, and
# remove the raw 'category' column to see how much subcategory alone explains.
# NOTE: this rebinds dummy_data, replacing the frame used for the first model.
dummy_data= dummy_data.drop(columns= ["backers",'category'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(dummy_data.drop('status_successful', axis=1),
                                                   dummy_data['status_successful'], test_size=0.2,
                                                   random_state=200)
print(X_train.shape)
print(X_train)


In [None]:
print(y_train.shape)
print(y_train)

In [None]:
# Fit a new scaler and a new model on the feature set without 'backers'.
# NOTE: `sc` and `LogReg` are rebound here, so from this cell on any code that
# reads these globals (e.g. Backers_TestCaseGen, which calls sc.transform)
# sees the no-backers scaler and model rather than the earlier ones.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# class_weight='balanced' is the only change from the first model's settings.
LogReg = LogisticRegression(solver='liblinear',class_weight = 'balanced')
LogReg.fit(X_train, y_train)
y_pred = LogReg.predict(X_test)

In [None]:
## Model Evaluation
### Classification report without cross-validation
print(classification_report(y_test, y_pred))

In [None]:
### K-fold cross-validation & confusion matrices
#calculate the prediction score for your training set
y_train_pred = cross_val_predict(LogReg, X_train, y_train, cv=5)
confusion_matrix(y_train, y_train_pred)

In [None]:
precision_score(y_train, y_train_pred)

In [None]:
#Classification report with cross-validation
print(classification_report(y_train, y_train_pred))

In [None]:
#Create an ROC curve to assess the tradeoff between sensitivity and specificity
y_pred_proba = LogReg.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

In [None]:
# Save the coefficients of the no-backers model in a labelled dataframe.
# The feature names are derived from dummy_data (minus the target column), in
# the order the model was trained on, with the "subcategory" prefix shortened
# to "sc". Deriving them keeps the labels in step with the data and with
# LogReg.coef_ instead of relying on a hand-typed list.
column_names = dummy_data.drop('status_successful', axis=1).columns
no_backers_columns = [name.replace('subcategory', 'sc') for name in column_names]

no_backers_coeff_df = pd.DataFrame(LogReg.coef_, columns=no_backers_columns)

print(no_backers_coeff_df.columns)

In [None]:
# Add a second row that introduces a 'backers' column, so the no-backers
# weights have a 'backers' entry (empty in the coefficient row) to line up
# with the first model's weights.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df2 = {'backers': 'NA'}
no_backers_coeff_df = pd.concat(
    [no_backers_coeff_df, pd.DataFrame([df2])],
    ignore_index=True,
)

In [None]:
# Turn the coefficient frame on its side (one row per feature), discard the
# placeholder column labelled 1, and give the remaining column 0 a readable
# name. errors="raise" makes the rename fail loudly if column 0 is missing.
no_backers_coeff_df_transpose = (
    no_backers_coeff_df
    .transpose()
    .drop(columns=[1])
    .rename(columns={0: "Weights with No Backers"}, errors="raise")
)

In [None]:
# Put the weights of both models side by side, joined on the feature name
# (the row index of both frames), to see how they differ once backers and
# categories are removed.
weight_compare = coeff_df_transpose.merge(
    no_backers_coeff_df_transpose,
    left_index=True,
    right_index=True,
)

# Sorted copy, ascending by the first model's weights.
weight_comparison = weight_compare.sort_values('Weights with Backers')


In [None]:
# exp(coefficient) turns a logistic-regression weight (a log-odds change) into
# an odds ratio, not a probability. Try it on the 'backers' entry of the
# no-backers weights, looked up by label.
print(weight_comparison['Weights with No Backers']['backers'])

print(np.exp(weight_comparison['Weights with No Backers']['backers']))

In [None]:
# Re-express the coefficients as odds instead of log-odds, using
# odds = exp(coefficient). np.exp works element-wise on the whole column.
weight_comparison['Odds with Backers'] = np.exp(weight_comparison['Weights with Backers'])

# The equivalent column for the no-backers weights is currently disabled:
# weight_comparison['Odds with No Backers'] = np.exp(weight_comparison['Weights with No Backers'])

# Display only the first two columns of weight_comparison.
weight_comparison.iloc[:,0:2]

In [None]:
plt.clf()

a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
plt.xticks(rotation=90)
sns.barplot(x= weight_comparison['Weights with Backers'].index, y=  weight_comparison['Weights with Backers'])

In [None]:
# Grouped bar chart with all the variables.
# The chart is hard to read because the values differ so much in size, so two
# further charts follow: one with only the subcategory weights and one with
# everything else.

labels = weight_comparison.index

# Remove the literal 'sc_' prefix only. str.lstrip('sc_') would strip any
# leading 's', 'c' or '_' characters and turn 'comments' into 'omments'.
new_labels = [
    item[len('sc_'):] if item.startswith('sc_') else item
    for item in labels
]

x = np.arange(len(new_labels))  # the label locations
width = 0.35  # the width of the bars
a4_dims = (11.7, 8.27)

fig, ax = plt.subplots(figsize=a4_dims)

# One pair of bars per attribute: left = model with backers, right = without.
rects1 = ax.bar(x - width/2, weight_comparison['Weights with Backers'], width, label='Weights that Include backers', color='violet')
rects2 = ax.bar(x + width/2, weight_comparison['Weights with No Backers'], width, label='Weights W/o Backers', color='purple')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Attribute Weight')
ax.set_title('Logistic Regression Weight by Attribute')
ax.set_xticks(x)
# Rotation is set together with the labels so it applies to the final ticks.
ax.set_xticklabels(new_labels, rotation=90)
ax.legend()

In [None]:
# Keep only the rows whose label contains an underscore, i.e. the 'sc_...'
# subcategory weights.
final_weights_subcategory = weight_comparison.filter(like='_', axis=0)

labels = final_weights_subcategory.index

# Remove the literal 'sc_' prefix. str.lstrip('sc_') strips a set of
# characters, not a prefix, and would also eat leading 's'/'c' letters of a
# name.
new_labels = [
    label[len('sc_'):] if label.startswith('sc_') else label
    for label in labels
]

x = np.arange(len(new_labels))  # the label locations
width = 0.35  # the width of the bars
a4_dims = (14, 8.27)

fig, ax = plt.subplots(figsize=a4_dims)

# One pair of bars per subcategory: left = model with backers, right = without.
rects1 = ax.bar(x - width/2, final_weights_subcategory['Weights with Backers'], width, label='Calculated Weights with Backers', color='violet')
rects2 = ax.bar(x + width/2, final_weights_subcategory['Weights with No Backers'], width, label='Calculated Weights w/o Backers', color='purple')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Attribute Weight')
ax.set_title('Weights of Subcategories on Logistic Regression Model')
ax.set_xticks(x)
# Rotation is set together with the labels so it applies to the final ticks.
ax.set_xticklabels(new_labels, rotation=90)
ax.legend()

In [None]:
# The subcategory weights were charted above; this cell charts the remaining,
# non-subcategory attributes.
# NOTE: despite its name, final_weights_subcategory holds the NON-subcategory
# rows here (goal, duration, levels, updates, comments, backers); filter(items=...)
# keeps only those row labels.

final_weights_subcategory = weight_comparison.filter(items=['goal','duration','levels','updates','comments','backers'], axis =0 )

plt.clf()
labels = final_weights_subcategory.index


x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars
a4_dims = (11.7, 8.27)

fig, ax = plt.subplots(figsize=a4_dims)


# One pair of bars per attribute: left = model with backers, right = without.
rects1 = ax.bar(x - width/2, final_weights_subcategory['Weights with Backers'], width, label='Weights that Include backers', color = 'violet')
rects2 = ax.bar(x + width/2, final_weights_subcategory['Weights with No Backers'], width, label='Weights W/o Backers', color = 'purple')


# Dashed horizontal grid lines make the bar heights easier to compare.
plt.grid(color = 'grey', linestyle = '--', linewidth = 0.5, axis = 'y')

ax.set_ylabel('Attribute Weight')
ax.set_title('Weights of Non-Subcategories on Logistic Regression Model')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

### Creating predictions for hypothetical projects using Weights for No Backers



In [None]:
# To predict probability of a project working, run the line of code "LogReg.predict_proba([List of parameter values])
# Use the function TestCaseGen to create an array of parameter values to use with LogReg.predict_proba


In [None]:
def NoBackers_TestCaseGen(goal, levels, updates, comments, duration, desired_subcategory, scaler=None):
    """Build one scaled feature row for the model trained without `backers`.

    The row has one slot per entry of `parameter_list`: five numeric features
    followed by the one-hot subcategory columns.

    Parameters
    ----------
    goal, levels, updates, comments, duration : number
        Values written to the first five slots, in that order.
    desired_subcategory : str
        Name of the one-hot column to set to 1 (e.g. 'sc_Theater').
        'sc_Animation' has no column of its own in `parameter_list`; it is
        encoded as all one-hot columns being 0.
    scaler : object with a ``transform`` method, optional
        Scaler applied to the row. Defaults to the notebook-global ``sc``.
        Pass the scaler explicitly when ``sc`` has been rebound to a scaler
        fitted on a different feature set.

    Returns
    -------
    The result of ``scaler.transform`` on the row reshaped to (1, n_features).

    Raises
    ------
    ValueError
        If `desired_subcategory` is neither 'sc_Animation' nor an entry of
        `parameter_list`.
    """
    parameter_list = [
        'goal', 'levels', 'updates', 'comments', 'duration',
        'sc_Art', 'sc_Art Book', 'sc_Board & Card Games', 'sc_Childrens Book',
        'sc_Classical Music', 'sc_Comics', 'sc_Conceptual Art', 'sc_Country & Folk',
        'sc_Crafts', 'sc_Dance', 'sc_Design', 'sc_Digital Art', 'sc_Documentary',
        'sc_Electronic Music', 'sc_Fashion', 'sc_Fiction', 'sc_Film & Video',
        'sc_Food', 'sc_Games', 'sc_Graphic Design', 'sc_Hip-Hop', 'sc_Illustration',
        'sc_Indie Rock', 'sc_Jazz', 'sc_Journalism', 'sc_Mixed Media', 'sc_Music',
        'sc_Narrative Film', 'sc_Nonfiction', 'sc_Open Hardware', 'sc_Open Software',
        'sc_Painting', 'sc_Performance Art', 'sc_Periodical', 'sc_Photography',
        'sc_Poetry', 'sc_Pop', 'sc_Product Design', 'sc_Public Art', 'sc_Publishing',
        'sc_Rock', 'sc_Sculpture', 'sc_Short Film', 'sc_Technology', 'sc_Theater',
        'sc_Video Games', 'sc_Webseries', 'sc_World Music',
    ]
    generated_test_case = np.zeros(len(parameter_list))
    numeric_values = (goal, levels, updates, comments, duration)
    generated_test_case[:len(numeric_values)] = numeric_values
    if desired_subcategory != 'sc_Animation':
        # list.index raises ValueError for an unknown name.
        pointer = parameter_list.index(desired_subcategory)
        generated_test_case[pointer] = 1

    active_scaler = sc if scaler is None else scaler
    return active_scaler.transform(generated_test_case.reshape(1, -1))
    


In [None]:
# print(results)
# results
NoBackers_TestCaseGen(0,0,0,0,0,'sc_Animation')


In [None]:
# Plot how the predicted probability of success changes with the funding goal
# for a hypothetical 'sc_Theater' project (duration fixed at 0).
# NOTE: NoBackers_TestCaseGen scales its row with the notebook-level `sc`, so
# this cell assumes `sc` is currently the scaler that belongs to LogReg.

gta = 10000 # gta = goal test amount: goals 0 .. gta-1 are evaluated
x_values = list(range(gta))


def _theater_goal_sweep(goals, levels, updates, comments):
    """Return LogReg's predicted success probability for each goal in `goals`.

    All rows are stacked and scored with a single predict_proba call instead
    of one call per goal; column 1 is the same entry the per-row code read
    with [0][1].
    """
    rows = np.vstack([
        NoBackers_TestCaseGen(goal, levels, updates, comments, 0, 'sc_Theater')
        for goal in goals
    ])
    return LogReg.predict_proba(rows)[:, 1]


# No plt.clf() here: with no open figure it only creates (and displays) an
# empty extra figure.
fig = plt.figure(figsize =(10, 10))
sub1 = plt.subplot(1, 2, 1)
sub2 = plt.subplot(1, 2, 2)

# Left panel: no reward levels, updates or comments.
predicted_success = _theater_goal_sweep(x_values, 0, 0, 0)
sub1.plot(x_values, predicted_success)
sub1.set_title('levels = 0, updates = 0, comments = 0')

# as we can see, we see an extreme drop off once we get a threshold value

# Right panel: 5 reward levels, 5 updates and 5 comments.
predicted_success = _theater_goal_sweep(x_values, 5, 5, 5)
sub2.plot(x_values, predicted_success)
sub2.set_title('levels = 5, updates = 5, comments = 5')

for panel in (sub1, sub2):
    panel.set_xlabel('Goal $ Amount')
    panel.set_ylabel('Predicted probability of success')

plt.show()

### Updating my logistic regression model to not have outliers

Originally I was having issues getting accurate results from my model, and I assumed that it was because of the outliers in the data. It turns out that I wasn't properly scaling my input features like I needed to!

In [None]:
# Upper outlier fence for the 'goal' column: third quartile + 1.5 * interquartile range.
goal_q1, goal_q3 = np.percentile(dummy_data['goal'], [25, 75])
iq_range = goal_q3 - goal_q1
outlier_limit = goal_q3 + 1.5 * iq_range
print(outlier_limit)

In [None]:
# Keep only the projects whose goal is at most twice the outlier fence;
# .copy() makes the result an independent frame rather than a view of dummy_data.
no_outlier_dd = dummy_data[dummy_data['goal'] <= 2 * outlier_limit].copy()


In [None]:
# Display the filtered frame to check what remains after the goal cut-off.
no_outlier_dd

In [None]:
# 80/20 train/test split of the outlier-filtered data: every column except
# 'status_successful' is a feature, and 'status_successful' is the target.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    no_outlier_dd.drop(columns='status_successful'),
    no_outlier_dd['status_successful'],
    test_size=0.2,
    random_state=200,
)

In [None]:
# Standardize the features and fit the logistic regression on the filtered data.
# NOTE: this re-binds the notebook-level `sc`, which NoBackers_TestCaseGen reads
# when it is called; from here on that function scales rows for this model.
# NOTE: X_train / X_test are overwritten with their scaled versions, so re-running
# this cell without re-running the split would fit the scaler on already-scaled data.
sc = StandardScaler()
sc.fit(X_train)  # scaling statistics come from the training data only
X_train, X_test = sc.transform(X_train), sc.transform(X_test)
no_outlier_logreg = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
y_pred = no_outlier_logreg.predict(X_test)

In [None]:
# print(y_pred)
# Per-feature means stored on the fitted scaler `sc`.
sc.mean_

In [None]:
## Model Evaluation
### Classification report without cross-validation
# Compares the held-out test-set predictions (y_pred) with the true labels (y_test).
print(classification_report(y_test, y_pred))

In [None]:
### K-fold cross-validation & confusion matrices
# Out-of-fold predictions for the training set, using 5 folds.
# NOTE: X_train was standardized with statistics from the whole training set
# before this call, so the folds are not completely independent of one another.
y_train_pred = cross_val_predict(no_outlier_logreg, X_train, y_train, cv=5)

# Print the matrix explicitly: a bare expression is only displayed when it is
# the last line of a cell, so the result was previously discarded.
print(confusion_matrix(y_train, y_train_pred))

#Classification report with cross-validation
print(classification_report(y_train, y_train_pred))

In [None]:
# ROC curve on the test set: the trade-off between the true-positive rate
# (sensitivity) and the false-positive rate (1 - specificity).
y_pred_proba = no_outlier_logreg.predict_proba(X_test)[:, 1]  # second probability column
auc = metrics.roc_auc_score(y_test, y_pred_proba)
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)  # thresholds are not needed
plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
plt.legend(loc='lower right')
plt.show()

In [None]:
# Column labels for the coefficients: every column of the filtered frame
# except the target. Despite its name, `coed_df` is an Index of column names,
# not a DataFrame.
coed_df = no_outlier_dd.columns.drop('status_successful')
print(no_outlier_logreg.coef_)
print(coed_df)

In [None]:
##Creation of our coefficient dataframe
# Wrap the fitted coefficient array in a DataFrame whose columns are the feature names in coed_df.
no_outlier_coeff_df = pd.DataFrame(no_outlier_logreg.coef_, columns = coed_df)
no_outlier_coeff_df.head()


#The updated weights make a lot of sense, with comments being the strongest indicator of project success. This is plausible because comments can be used as a secondary measure of project popularity.


### Let's now test to see if these weights actually produce accurate results

In [None]:
# Display the filtered frame again as a reference for the test cases below.
no_outlier_dd

In [None]:
###### Subplot example ######

# fig = plt.figure(figsize=(20, 10))
# print(fig)

# fig.suptitle('Subplot example3-1: Add subplot later', fontsize=20)

# # Add plots
# ax1 = fig.add_subplot(1, 3, 1)
# ax1.plot(x, y)
# ax1.set_xlabel('X label, plot1')
# ax1.set_ylabel('Y label, plot1')
# ax1.set_xticklabels('')
# ax1.set_yticklabels('')


In [None]:
# Plot how the predicted probability of success changes with the funding goal
# for a hypothetical 'sc_Art' project, varying one input per panel:
# reward levels (left), updates (middle) and comments (right).

gta = 20000 # gta = goal test amount: goals 0 .. gta-1 are evaluated
lta = 10 #lta = levels test amount
uta = 10 #uta = update test amount
cta = 10 #cta = comment test amount

x_values = list(range(gta))


def _art_goal_sweep(goals, levels=0, updates=0, comments=0):
    """Return no_outlier_logreg's predicted success probability for each goal.

    The rows are stacked and scored with one predict_proba call per curve
    instead of one call per goal. A new array is returned every time, so each
    plotted line owns its data rather than sharing one reused buffer.
    """
    rows = np.vstack([
        NoBackers_TestCaseGen(goal, levels, updates, comments, 0, 'sc_Art')
        for goal in goals
    ])
    return no_outlier_logreg.predict_proba(rows)[:, 1]


# No plt.clf() here: with no open figure it only creates (and displays) an
# empty extra figure.
fig = plt.figure(figsize=(20, 10))
fig.suptitle('Probability Calculations based on Non-Outlier Model', fontsize=20)

ax1 = fig.add_subplot(1,3,1)
ax2 = fig.add_subplot(1,3,2)
ax3 = fig.add_subplot(1,3,3)

for panel, panel_title in ((ax1, 'Varying levels'),
                           (ax2, 'Varying updates'),
                           (ax3, 'Varying comments')):
    panel.set(xlim=[0,gta],ylim=[0,1.0])
    panel.set_xlabel('Goal $ Amount',fontsize=18)
    panel.set_title(panel_title, fontsize=18)
ax1.set_ylabel('Predicted probability of success', fontsize=18)

for levels in range(lta):
    predicted_success = _art_goal_sweep(x_values, levels=levels)
    ax1.plot(x_values, predicted_success, label = 'Levels =' + " " + str(levels))

# #as we can see, we see an extreme drop off once we get a threshold value
for updates in range(uta):
    predicted_success = _art_goal_sweep(x_values, updates=updates)
    ax2.plot(x_values, predicted_success, label = 'Updates =' + " " + str(updates))

for comments in range(cta):
    predicted_success = _art_goal_sweep(x_values, comments=comments)
    ax3.plot(x_values, predicted_success, label = 'Comments =' + " " + str(comments))

ax1.legend(loc = 'best')
ax2.legend(loc = 'best')
ax3.legend(loc = 'best')

plt.show()


In [None]:
# Spot-check the no-outlier model on very small goals for an 'sc_Art' project
# (levels, updates, comments and duration all 0).

# Single prediction for a goal of 1.
test_case1 = NoBackers_TestCaseGen(1,0,0,0,0,'sc_Art')
print(no_outlier_logreg.predict_proba(test_case1.reshape(1,-1))[0][1])

# Predictions for goals 0..9, stored in their own array. The previous version
# wrote into the first 10 slots of the 20,000-long plotting buffer and then
# printed the whole buffer, which mixed in stale values from the plots.
n_spot_checks = 10
spot_check_success = np.zeros(n_spot_checks)
for x in range(n_spot_checks):
    test_case = NoBackers_TestCaseGen(x,0,0,0,0,'sc_Art')
    spot_check_success[x] = no_outlier_logreg.predict_proba(test_case.reshape(1,-1))[0][1]
print(spot_check_success.reshape(-1,1))


In [None]:
# Compare how comments and updates move the predicted probability of success
# for an 'sc_Art' project: for each count 0 .. limit-1, predict over goals
# 0 .. gta-1 with that many comments (and no updates), then with that many
# updates (and no comments).
limit = 10
gta = 5  # NOTE: re-binds the notebook-level gta used by the plotting cells
comment_success = np.zeros(gta)
update_success = np.zeros(gta)

for index in range(limit):
    for x in range(gta):
        comments_tc = NoBackers_TestCaseGen(x,0,0,index,0,'sc_Art')
        comment_success[x] = no_outlier_logreg.predict_proba(comments_tc.reshape(1,-1))[0][1]
        updates_tc = NoBackers_TestCaseGen(x,0,index,0,0,'sc_Art')
        update_success[x] = no_outlier_logreg.predict_proba(updates_tc.reshape(1,-1))[0][1]
    # Label the block so the printed number is not mistaken for a goal amount.
    print("Number of comments / updates = " + str(index))
    print("Comments")
    print(comment_success)
    print("Updates")
    print(update_success)

In [None]:
# Difference between the scaled rows with one update vs. none, and with one
# comment vs. none: shows how far a single extra update / comment moves the
# scaled feature row (every other input is identical in each pair).
print(NoBackers_TestCaseGen(0,0,1,0,0,'sc_Art') - NoBackers_TestCaseGen(0,0,0,0,0,'sc_Art'))
print(NoBackers_TestCaseGen(0,0,0,1,0,'sc_Art') - NoBackers_TestCaseGen(0,0,0,0,0,'sc_Art'))

In [None]:
# List dummy_data's columns and show the column count minus 5.
# Presumably the 5 stands for the numeric features goal / levels / updates /
# comments / duration, leaving the dummy columns - verify. Note the count would
# still include 'status_successful', which is dropped only when the features are split off.
print(dummy_data.columns)
len(dummy_data.columns) - 5

In [None]:
# Sorted list of the distinct subcategory values in final_data, for comparison
# with the 'sc_' column names hard-coded in NoBackers_TestCaseGen.
print(final_data['subcategory'].sort_values().unique())