In [17]:
# Data management
import pandas as pd
import json
from datetime import datetime

In [18]:
# date unix conversion
def date_f(unix_d):
    d = datetime.utcfromtimestamp(unix_d).strftime('%Y-%m-%d %H:%M:%S')
    d1 = datetime.strptime(d, '%Y-%m-%d %H:%M:%S')
    return d1

# cross-validation model
def valuta_modello(X, y, model, n_splits=10, shuffle=True):
    # non avendo comunque un grande sbilanciamento procedo solo con la StratifiedKfold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle)
    s = ['accuracy', 'precision', 'recall', 'f1']
    print('\n {}'.format(skf))
    scores = cross_validate(model, X, y, scoring=s, cv=skf, n_jobs=-1)
    y_train_predicted = cross_val_predict(model, X, y, cv=skf)
    for j in s:
        print('>scoring={} mean={:.4f} se={:.3f}'.format(j, np.mean(scores[('test_'+ j)]), sem(scores[('test_'+ j)])))
    print('\n confusion matrix \n {}'.format(confusion_matrix(y, y_train_predicted)))

# embedding sentences
def embedding_text(t, model):
    blurb_l = t.values.tolist()
    sentence_embeddings = np.asmatrix(model(blurb_l).numpy())
    return sentence_embeddings

In [15]:
data = [json.loads(line) for line in open('data3/Kickstarter_2021-01-14T03_20_05_328Z.json', 'r', encoding="utf8")]
data = pd.json_normalize(data)

In [19]:
# keep only the useful columns
ks = data.iloc[: , [3, 13, 14, 15, 17, 25, 28, 32, 59, 64, 68]]

# rename columns
ks = ks.rename(lambda x: x.replace(".", "_"), axis=1)
ks = ks.rename(lambda x: x.replace("data_", ""), axis=1)

# remove records in the columns 'state' with value 'live' and 'canceled'
ks = ks[ks.state != 'live']
ks = ks[ks.state != 'canceled']

# remove records in the columns 'id' with value duplicate
ks = ks.drop_duplicates('id')

# conversion of unix date
ks['deadline'] = ks['deadline'].apply(date_f)
ks['launched_at'] = ks['launched_at'].apply(date_f)

# create a column with only the days relating to the funding period
ks['funding_period'] = (ks['deadline']-ks['launched_at']).dt.days

# create two columns with only the months related to the date of columns 'deadline' and 'launched_at'
ks['deadline_month'] = ks['deadline'].dt.month
ks['launched_at_month'] = ks['launched_at'].dt.month

# conversion of the 'goal' currency in USD
ks['goal'] = ks['goal']*ks['static_usd_rate']

# create two columns with the length of value in the columns 'name' and 'blurb'
ks['name_l'] = ks['name'].str.len()
ks['blurb_l'] = ks['blurb'].str.len()

# remove the null values from the column 'location_expanded_country' and reset the index
ks.dropna(subset=['location_expanded_country'], inplace=True)
ks.dropna(subset=['category_parent_name'], inplace=True)
ks = ks.reset_index(drop=True)

In [20]:
# extract the label column and remove it from the dataset along with non-useful columns
ks_label1 = ks['state'].map(
    {'failed':0,
     'successful':1
    }
)
ks1 = ks.drop(['id', 'name', 'blurb', 'state', 'deadline', 'launched_at', 'static_usd_rate'], axis=1)

In [21]:
ks1.columns = ['goal', 'country', 'main_category', 'sec_category', 'funding_period', 'deadline_month', 'launched_month', 'name_l', 'descr_l']

In [22]:
ks1.head()

Unnamed: 0,goal,country,main_category,sec_category,funding_period,deadline_month,launched_month,name_l,descr_l
0,5227.0,United States,Community Gardens,Food,29,4,3,29,123
1,46282.0323,United Kingdom,Drama,Film & Video,30,8,7,15,119
2,1000.0,United States,Playing Cards,Games,30,7,6,55,133
3,1019.89432,Hong Kong,Playing Cards,Games,30,10,9,31,30
4,550.0,United States,Textiles,Art,30,2,1,27,124
