In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from gensim import corpora, models, similarities, downloader
from sklearn.decomposition import PCA
from matplotlib import pyplot

# text processing with sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, Binarizer


In [3]:
# Load the cleaned dataset from a CSV file in the working directory.
df = pd.read_csv('data_clean.csv')

In [4]:
# Check column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177522 entries, 0 to 177521
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   blurb              177514 non-null  object 
 1   country            177522 non-null  object 
 2   goal               177522 non-null  float64
 3   name               177522 non-null  object 
 4   state              177522 non-null  object 
 5   main_category      177522 non-null  object 
 6   sub_category       177522 non-null  object 
 7   location_type      177522 non-null  object 
 8   duration           177522 non-null  int64  
 9   deadline_month     177522 non-null  int64  
 10  deadline_day       177522 non-null  int64  
 11  launched_at_month  177522 non-null  int64  
 12  launched_at_day    177522 non-null  int64  
 13  target             177522 non-null  int64  
 14  baseline           177522 non-null  int64  
dtypes: float64(1), int64(7), object(7)
memory usage: 20

In [5]:
def tokenizer(col, ngram_range=(1, 2), max_features=100):
    """Build bag-of-words count features from a text column of the global ``df``.

    Fits a ``CountVectorizer`` (English stop words removed) on ``df[col]`` and
    prints the ten terms with the highest total count as a quick sanity check.

    Parameters
    ----------
    col : str
        Name of the text column in ``df`` to vectorize.
    ngram_range : tuple of (int, int), default (1, 2)
        Smallest and largest n-gram size to extract.
    max_features : int, default 100
        Maximum number of terms kept in the vocabulary.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (same index) and one column per retained
        term, holding the term counts.
    """
    # Missing text would make the vectorizer raise, so treat it as empty.
    corpus = df[col].fillna('')
    # Tokens are runs of at least three ASCII letters: this skips numbers,
    # one- and two-letter words and special characters.
    model = CountVectorizer(
        ngram_range=ngram_range,
        max_features=max_features,
        token_pattern=r'\b[a-zA-Z]{3,}\b',
        stop_words='english',
    )
    matrix = model.fit_transform(corpus).toarray()
    # Reuse the source index so the result lines up row-for-row with df
    # when the two frames are concatenated.
    df_output = pd.DataFrame(
        data=matrix,
        index=corpus.index,
        columns=model.get_feature_names_out(),
    )

    print(df_output.sum().sort_values(ascending=False).head(10))

    return df_output


In [6]:
# Replace missing blurbs with empty strings so the vectorizers accept every row.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['blurb'] is a chained in-place update, which pandas warns about and which
# does not modify df when Copy-on-Write is enabled.
df['blurb'] = df['blurb'].fillna('')

In [17]:
# Count-based features for the blurb text; the returned frame is the cell output.
tokenizer('blurb')

help     18741
new      16787
world    10419
music     9865
book      9573
art       9384
album     9266
make      8528
film      7575
life      7522
dtype: int64


Unnamed: 0,adventure,album,app,art,artist,artists,band,based,beautiful,best,...,using,video,want,way,women,work,world,year,years,young
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177517,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
177518,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
177519,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
177520,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
def tokenizer_tfdif(col, ngram_range=(3, 7), max_features=100):
    """Build TF-IDF n-gram features from a text column of the global ``df``.

    Fits a ``TfidfVectorizer`` (English stop words removed, default token
    pattern) on ``df[col]`` and prints the ten n-grams with the highest summed
    TF-IDF weight as a quick sanity check.

    Parameters
    ----------
    col : str
        Name of the text column in ``df`` to vectorize.
    ngram_range : tuple of (int, int), default (3, 7)
        Smallest and largest n-gram size to extract.
    max_features : int, default 100
        Maximum number of n-grams kept in the vocabulary.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (same index) and one column per retained
        n-gram, holding the TF-IDF weights.
    """
    # Missing text would make the vectorizer raise, so treat it as empty.
    corpus = df[col].fillna('')
    model = TfidfVectorizer(
        ngram_range=ngram_range,
        max_features=max_features,
        stop_words='english',
    )
    matrix = model.fit_transform(corpus).toarray()
    # Reuse the source index so the result lines up row-for-row with df
    # when the two frames are concatenated.
    df_output = pd.DataFrame(
        data=matrix,
        index=corpus.index,
        columns=model.get_feature_names_out(),
    )

    print(df_output.sum().sort_values(ascending=False).head(10))

    return df_output

In [15]:
# Build the TF-IDF n-gram features for the blurb text; they are concatenated
# onto df in the following cell.
df_tokenizer_blurb = tokenizer_tfdif('blurb')


hard enamel pins      387.607912
new york city         332.778335
coffee table book     217.385861
help make happen      192.559075
deck playing cards    185.451126
album need help       174.175809
need help make        166.671949
hard enamel pin       138.201218
trying raise money    129.408186
help raise funds      109.456541
dtype: float64


In [16]:

# Attach the blurb feature columns to the dataset, side by side (aligned on index).
# NOTE: df is rebound here, so running this cell again appends the same columns a second time.
df = pd.concat(objs=[df, df_tokenizer_blurb], axis='columns')

In [17]:
# Persist the dataset together with the text features, without the index column.
# NOTE(review): the file name says "ngram_2_7" while tokenizer_tfdif uses
# ngram_range=(3, 7) — confirm which one is intended.
df.to_csv('data_text_processed_ngram_2_7.csv', index=False)

In [25]:
# Remove the raw text columns now that their features have been extracted.
# Reassigning instead of using inplace=True keeps the step explicit and chainable.
df = df.drop(columns=['blurb', 'name'])

In [26]:
# Re-check the column count and dtypes after dropping the raw text columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177522 entries, 0 to 177521
Columns: 112 entries, country to young
dtypes: float64(101), int64(6), object(5)
memory usage: 151.7+ MB


In [27]:
# Preview the first rows of the resulting feature table.
df.head()

Unnamed: 0,country,goal,state,main_category,sub_category,location_type,duration,deadline_month,deadline_day,launched_at_month,...,using,video,want,way,women,work,world,year,years,young
0,US,1000.0,successful,games,playing cards,Town,30,9,9,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,US,15000.0,successful,music,rock,Suburb,30,6,12,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,GB,10000.0,failed,games,playing cards,Town,59,3,13,1,...,0.527432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,US,2800.0,successful,publishing,nonfiction,Town,30,1,9,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,US,3500.0,successful,music,classical music,Town,30,5,2,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
