### 1. Import packages

In [4]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from langdetect import detect

In [31]:
#!pip install pymorphy2
from pymorphy2 import MorphAnalyzer
pymorphy2_analyzer = MorphAnalyzer()

### 2. Functions

#### Merge data

In [1]:
import os

In [3]:
def concat_data(folder):
    files = os.listdir('data/'+str(folder))
    df = pd.read_csv('data/'+str(folder)+'/'+str(files[0]))
    for i in range(len(files)):
        df2 = pd.read_csv('data/'+str(folder)+'/'+str(files[i]))
        df = pd.concat([df,df2])
    print(df.shape)
    print('/...Export the data')
    df.to_csv('./data/'+str(folder)+'/'+str(folder)+'_full')

In [None]:
#give the function the name of the folder where the samples stored
concat_data('rewards')

In [16]:
df1 = pd.read_csv('./data/projects/projects_1_800.csv')
df2 = pd.read_csv('./data/projects/projects_801_1600.csv')

In [17]:
projects = pd.concat([df1,df2])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [18]:
projects.to_csv('./data/projects/projects_full.csv')

#### Clean projects

In [None]:
#upload data
projects = pd.read_csv('./data/projects/projects_full.csv')
creaters = pd.read_csv('./data/creaters/creaters_full.csv')
rewards = pd.read_csv('./data/rewards/rewards_full.csv')

In [40]:
def tokenize(text):
    lang = {'de': 'dutch',
            'en': 'english',
            'fi': 'finnish',
            'fr': 'french',
            'no': 'norwegian'}
    words = text.lower().split()
    words_normalized = []
    language = detect(text)
    if language in lang.keys():
        words = [word for word in words if word not in stopwords.words(lang[language])]

    for w in words:
        lemma = pymorphy2_analyzer.parse(w)
        words_normalized.append(lemma[0].normal_form)
    return ', '.join(words_normalized)

In [None]:
#reset_index
projects.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'], inplace=True)
creaters.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'], inplace=True)
rewards.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1'], inplace=True)

In [41]:
#run to clean the data
projects['funded'] = projects['funded'].apply(lambda x: ''.join(re.findall(r'\d', x))).astype('float64')
projects['target'] = projects['target'].apply(lambda x: ''.join(re.findall(r'\d', x))).astype('float64')
projects['end_date'] = pd.to_datetime(projects['end_date'])
projects['short_des'] = projects['short_des'].apply(lambda x: tokenize(str(x)))

In [None]:
#export data
projects.to_csv('./data/projects/projects_cleaned.csv')
rewards.to_csv('./data/rewards/rewards_cleaned.csv')
creaters.to_csv('./data/creaters/creaters_cleaned.csv')

In [43]:
projects.head(10)

Unnamed: 0.1,Unnamed: 0,pro_link,title,status,percentage_fund,funded,target,end_date,backers,subcategory,thumbnail_type,number_rewards,min_price,max_price,num_news,num_comments,num_contributions,creater_link,language,short_des
0,0,https://www.kisskissbankbank.com/en/projects/t...,"Cassava, table & boulangerie artisanale engagée",Successful,106,4225.0,4000.0,2021-01-12,42,"Organic,Locavore,Local Development",png,8,10.0,400.0,1,16,46,https://www.kisskissbankbank.com/en/users/vale...,fr,"participez, naissance, cassava, réunion!, mari..."
1,1,https://www.kisskissbankbank.com/en/projects/i...,IBIZA HORSE VALLEY,Successful,104,20880.0,20000.0,2021-01-12,175,Animal Welfare,png,4,10.0,500.0,6,71,190,https://www.kisskissbankbank.com/en/users/jean...,fr,"sauvons, ensemble, ""ibiza, horse, valley"",, sa..."
2,2,https://www.kisskissbankbank.com/en/projects/ob2,ENREGISTREMENT DE L'ALBUM OB2 GREGORY OTT ET V...,Successful,108,10750.0,10000.0,2021-01-12,211,Indie,jpg,7,10.0,500.0,0,53,222,https://www.kisskissbankbank.com/en/users/vinc...,hr,"soutenez, le, projet, sur, kisskissbankbank, !"
3,3,https://www.kisskissbankbank.com/en/projects/p...,Playschool l'album : LA GARDERIE,Successful,120,3595.0,3000.0,2021-01-12,95,Indie,jpg,7,10.0,1000.0,4,22,98,https://www.kisskissbankbank.com/en/users/emma...,fr,"collectif, d’artistes,, playschool, a, besoin,..."
4,4,https://www.kisskissbankbank.com/en/projects/s...,S!CK soutient l'indépendance avec le double-nu...,Successful,876,876.0,100.0,2021-01-12,312,"French manufacturing,Independent media",gif,12,14.0,225.0,10,41,325,https://www.kisskissbankbank.com/en/users/yoan...,fr,"164, pages,, 2, couvertures, 0, publicité, !"
5,5,https://www.kisskissbankbank.com/en/projects/t...,"TEASER Long-métrage ""L'APPÂT""",Successful,169,3642.0,2160.0,2021-01-11,62,"Anti racism,International solidarity,Independe...",jpg,5,5.0,80.0,0,11,64,https://www.kisskissbankbank.com/en/users/teas...,en,"soutenez, le, tournage, du, teaser, du, long-m..."
6,6,https://www.kisskissbankbank.com/en/projects/e...,En immersion Engagée,Successful,682,13639.0,2000.0,2021-01-10,72,"Local Development,Indie,French manufacturing",png,7,20.0,500.0,1,14,74,https://www.kisskissbankbank.com/en/users/loic...,fr,"pré-réservez, aujourd’hui, prochaine, micro-im..."
7,7,https://www.kisskissbankbank.com/en/projects/r...,"Route 64, la revue francophone du jeu d'échecs...",Successful,203,406.0,200.0,2021-01-10,172,Independent media,jpg,7,25.0,5000.0,26,34,177,https://www.kisskissbankbank.com/en/users/fabr...,fr,"soutenez, création, route, 64,, nouveau, média..."
8,8,https://www.kisskissbankbank.com/en/projects/c...,"Caremitou®, maison de santé connectée qui pren...",Successful,108,27.0,25.0,2021-01-10,16,"Animal Welfare,Recycling,French manufacturing",gif,8,5.0,710.0,1,2,19,https://www.kisskissbankbank.com/en/users/phil...,fr,"bac, litière, connecté,, esthétique, performan..."
9,9,https://www.kisskissbankbank.com/en/projects/n...,"NOMANK, les super-tisanes bio 100% françaises !",Successful,110,549.0,500.0,2021-01-10,122,"Organic,French manufacturing",gif,6,18.0,70.0,4,16,129,https://www.kisskissbankbank.com/en/users/mano...,fr,"soutenez-nous, remettons, ensemble, goût, jour..."
