## Vectorizing tabular fields

In [1]:
import pandas as pd
from pathlib import Path
import sys
sys.path.append("..")
import warnings
warnings.filterwarnings("ignore")

from ml_editor.data_processing import get_normalized_series


data_path=Path("D:\Project 1\data\writers.csv")
df=pd.read_csv(data_path)

In [2]:
df['is_question']=df["PostTypeId"] == 1

tabular_df=df[df["is_question"]][["Tags", "CommentCount", "CreationDate", "Score"]]
tabular_df.head()

Unnamed: 0,Tags,CommentCount,CreationDate,Score
0,<resources><first-time-author>,7,2010-11-18T20:40:32.857,32
1,<fiction><grammatical-person><third-person>,0,2010-11-18T20:42:31.513,20
2,<publishing><novel><agent>,1,2010-11-18T20:43:28.903,34
3,<plot><short-story><planning><brainstorming>,0,2010-11-18T20:43:59.693,28
4,<fiction><genre><categories>,1,2010-11-18T20:45:44.067,21


In [3]:
tabular_df["NormComment"]=get_normalized_series(tabular_df, "CommentCount")
tabular_df["NormScore"]=get_normalized_series(tabular_df, "Score")
tabular_df.head()

Unnamed: 0,Tags,CommentCount,CreationDate,Score,NormComment,NormScore
0,<resources><first-time-author>,7,2010-11-18T20:40:32.857,32,1.405553,3.66092
1,<fiction><grammatical-person><third-person>,0,2010-11-18T20:42:31.513,20,-0.878113,2.02388
2,<publishing><novel><agent>,1,2010-11-18T20:43:28.903,34,-0.551875,3.93376
3,<plot><short-story><planning><brainstorming>,0,2010-11-18T20:43:59.693,28,-0.878113,3.11524
4,<fiction><genre><categories>,1,2010-11-18T20:45:44.067,21,-0.551875,2.1603


In [4]:
# Create our date to a pandas datetime
tabular_df["date"]=pd.to_datetime(tabular_df["CreationDate"])

# Extract meaningful features from the datetime object
tabular_df["year"]=tabular_df["date"].dt.year
tabular_df["month"]=tabular_df["date"].dt.month
tabular_df["day"]=tabular_df["date"].dt.day
tabular_df["hour"]=tabular_df["date"].dt.hour


tabular_df.head()

Unnamed: 0,Tags,CommentCount,CreationDate,Score,NormComment,NormScore,date,year,month,day,hour
0,<resources><first-time-author>,7,2010-11-18T20:40:32.857,32,1.405553,3.66092,2010-11-18 20:40:32.857,2010,11,18,20
1,<fiction><grammatical-person><third-person>,0,2010-11-18T20:42:31.513,20,-0.878113,2.02388,2010-11-18 20:42:31.513,2010,11,18,20
2,<publishing><novel><agent>,1,2010-11-18T20:43:28.903,34,-0.551875,3.93376,2010-11-18 20:43:28.903,2010,11,18,20
3,<plot><short-story><planning><brainstorming>,0,2010-11-18T20:43:59.693,28,-0.878113,3.11524,2010-11-18 20:43:59.693,2010,11,18,20
4,<fiction><genre><categories>,1,2010-11-18T20:45:44.067,21,-0.551875,2.1603,2010-11-18 20:45:44.067,2010,11,18,20


In [6]:
tags = tabular_df["Tags"]
clean_tags = tags.str.split("><").apply(lambda x: [a.strip("<").strip(">") for a in x])
tag_columns = clean_tags.apply(pd.Series).stack().reset_index(level=1, drop=True)
tag_dummies = pd.get_dummies(tag_columns)
tag_counts = tag_dummies.sum()
top_tags = tag_counts[tag_counts > 500]
top_tag_columns = tag_dummies[top_tags.index]

In [7]:
top_tag_columns.head()

Unnamed: 0,characters,creative-writing,fiction,novel,publishing,style,technique
0,False,False,False,False,False,False,False
0,False,False,False,False,False,False,False
1,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False


In [9]:
# Add our tags back into our initial DataFrame
final = pd.concat([tabular_df.reset_index(drop=True), top_tag_columns.reset_index(drop=True)], axis=1)


# Keeping only the vectorized features
col_to_keep = ["year", "month", "day", "hour", "NormComment",
               "NormScore"] + list(top_tags.index)
final_features = final[col_to_keep]

In [10]:
final_features.head()

Unnamed: 0,year,month,day,hour,NormComment,NormScore,characters,creative-writing,fiction,novel,publishing,style,technique
0,2010.0,11.0,18.0,20.0,1.405553,3.66092,False,False,False,False,False,False,False
1,2010.0,11.0,18.0,20.0,-0.878113,2.02388,False,False,False,False,False,False,False
2,2010.0,11.0,18.0,20.0,-0.551875,3.93376,False,False,True,False,False,False,False
3,2010.0,11.0,18.0,20.0,-0.878113,3.11524,False,False,False,False,False,False,False
4,2010.0,11.0,18.0,20.0,-0.551875,2.1603,False,False,False,False,False,False,False
