In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [3]:
%matplotlib inline
sns.set({'figure.figsize':(16,8)})

# 2. Importation des données et des statistiques descriptives:

In [4]:
df = pd.read_csv("../dataset.csv", parse_dates=['deadline','launched'], converters={'ID': str})

In [5]:
num_col = []
text_col = []
date_col = []
for c in df.columns:
    if df[c].dtype == 'object':
        text_col.append(c)
    elif df[c].dtype == '<M8[ns]':
        date_col.append(c)
    else:
        num_col.append(c)
print(f"Le data a:\t{len(num_col)} numériques colonnes,\n\t\t{len(text_col)} type string colonne,\n\t\t{len(date_col)} type date colonne")

Le data a:	6 numériques colonnes,
		7 type string colonne,
		2 type date colonne


### Remplace des valeurs manques:

In [6]:
df.name.fillna('unknown', inplace=True)
df["usd pledged"].fillna(df['usd pledged'].mean(), inplace=True)
# re-tester des valeurs manquants
df.isna().sum()

ID                  0
name                0
category            0
main_category       0
currency            0
deadline            0
goal                0
launched            0
pledged             0
state               0
backers             0
country             0
usd pledged         0
usd_pledged_real    0
usd_goal_real       0
dtype: int64

# 5. Analyse des outliers:

In [7]:
df[df.backers == df.backers.max()]

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
187652,1955357092,Exploding Kittens,Tabletop Games,Games,USD,2015-02-20,10000.0,2015-01-20 19:00:19,8782571.99,successful,219382,US,8782571.99,8782571.99,10000.0


**On a beaucoup de outliers dans des colonnes numériques, mais on ne pourrait pas les supprimer parce que ils influent ou impacte au la réussite du projet**

- On normalise les données parce que la valeur est volumineux et afin de règler les outliers que on aborde au-dessus.

In [8]:
#for c in num_col:
#    df[c + "_norm"] = np.log(df[c] + 1)

In [9]:
df.isnull().sum()

ID                  0
name                0
category            0
main_category       0
currency            0
deadline            0
goal                0
launched            0
pledged             0
state               0
backers             0
country             0
usd pledged         0
usd_pledged_real    0
usd_goal_real       0
dtype: int64

In [10]:
df['launched_year'] = df['launched'].dt.year
df['launched_day'] = df['launched'].dt.dayofweek
df['launched_hour'] = df['launched'].dt.hour
df['launched_quarter'] = df['launched'].dt.quarter

In [11]:
df['deadline_year'] = df['deadline'].dt.year
df['deadline_day'] = df['deadline'].dt.dayofweek
df['deadline_hour'] = df['deadline'].dt.hour
df['deadline_quarter'] = df['deadline'].dt.quarter

In [12]:
pd.set_option('display.max_columns',30)

In [13]:
data_raw = df.copy()

In [14]:
col_drop = ['ID', 'name','deadline', 'launched', 'state']
cat_col = ['category', 'main_category', 'currency', 'country']

## Using LabalEncoder to transform categorials variables to numerious variables

In [15]:
from sklearn.preprocessing  import LabelEncoder
for c in cat_col:
    df[c].astype(str)
    le = LabelEncoder()
    le.fit(df[c])
    df[c] = le.transform(df[c])
df[cat_col].head()

Unnamed: 0,category,main_category,currency,country
0,108,12,5,9
1,93,6,13,22
2,93,6,13,22
3,90,10,13,22
4,55,6,13,22


In [16]:
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,launched_year,launched_day,launched_hour,launched_quarter,deadline_year,deadline_day,deadline_hour,deadline_quarter
0,1000002330,The Songs of Adelaide & Abullah,108,12,5,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,9,0.0,0.0,1533.95,2015,1,12,3,2015,4,0,4
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,93,6,13,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,22,100.0,2421.0,30000.0,2017,5,4,3,2017,2,0,4
2,1000004038,Where is Hank?,93,6,13,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,22,220.0,220.0,45000.0,2013,5,0,1,2013,1,0,1
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,90,10,13,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,22,1.0,1.0,5000.0,2012,5,3,1,2012,0,0,2
4,1000011046,Community Film Project: The Art of Neighborhoo...,55,6,13,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,22,1283.0,1283.0,19500.0,2015,5,8,3,2015,5,0,3


In [17]:
from sklearn.model_selection import train_test_split
dt = df.copy()
y = df['state']
x = dt.drop(col_drop,axis=1)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=42)

In [18]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [19]:
rf.score(x_train,y_train)

0.9941801352136481

In [20]:
rf.score(x_test,y_test)

0.8740311357004212

y_pred = rf.predict(x_test)
#taux de succès
acc = 100*metrics.accuracy_score(y_test,y_pred)
print(f"{acc:.3f}%")

## One hot encoding:

In [21]:
df = pd.get_dummies(data_raw,columns=cat_col,drop_first=True)
df.head()

Unnamed: 0,ID,name,deadline,goal,launched,pledged,state,backers,usd pledged,usd_pledged_real,usd_goal_real,launched_year,launched_day,launched_hour,launched_quarter,...,country_FR,country_GB,country_HK,country_IE,country_IT,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,1000002330,The Songs of Adelaide & Abullah,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,0.0,0.0,1533.95,2015,1,12,3,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,100.0,2421.0,30000.0,2017,5,4,3,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,1000004038,Where is Hank?,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,220.0,220.0,45000.0,2013,5,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,1.0,1.0,5000.0,2012,5,3,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,1000011046,Community Film Project: The Art of Neighborhoo...,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,1283.0,1283.0,19500.0,2015,5,8,3,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [24]:
dt = df.copy()
y = df['state']
x = dt.drop(col_drop,axis=1)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=42)

In [25]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
rf.score(x_train,y_train)

0.9937840014788992

In [27]:
rf.score(x_test,y_test)

0.8709545376520144

In [28]:
df.shape

(378661, 226)