In [1]:
import pandas as pd
import json
# preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import string
# model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import sklearn.metrics as metrics

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("classification_data_original.csv")

In [4]:
mydf = df[['index', 'id', 'content', 'title', 'preprocessed_content', 'label', 'hierarchy']]

In [5]:
mydf.head()

Unnamed: 0,index,id,content,title,preprocessed_content,label,hierarchy
0,133,https://www.whowhatwear.com/5-looks-that-put-b...,After seeing burgundy boots on some of our fav...,5 Looks That Put Burgundy Boots On Our Must-Ha...,seeing burgundy boot favorite street style blo...,Womens\ Casual\ Wear,"[{'Style & Fashion': {""Women's Fashion"": {""Wom..."
1,189,https://www.wisebread.com/what-you-need-to-kno...,"According to a September 29, 2009 Fidelity Inv...",How to Make the Most of Your 401K,according september fidelity investments study...,Personal\ Investing,[{'Personal Finance': 'Personal Investing'}]
2,401,https://www.theturekclinic.com/blog/mindful-ea...,"“Thou shouldst eat to live; not live to eat,” ...",Mindful Eating,thou shouldst eat live live eat said socrates ...,Mens\ Health,"[{'Healthy Living': ""Men's Health""}]"
3,198,https://www.news-medical.net/news/20190611/Ear...,Researchers continue to dig for molecular clue...,Early life adversity and high levels of FKBP5 ...,researchers continue dig molecular clue better...,Hormonal\ Disorders,[{'Medical Health': {'Diseases and Conditions'...
4,24,https://hobbylark.com/card-games/Top-Strongest...,"As any duelist knows, the vast majority of Yu-...",Top 10 Strongest (Highest ATK) Monsters in Yu-...,duelist know vast majority yu gi oh match end ...,Card\ Games,[{'Hobbies & Interests': {'Games and Puzzles':...


# Separate hierarchically structured target variable into multiple columns

In [6]:
# Put hierarchical categories into seperate columns
categories = mydf.hierarchy.str.split(':', 4, expand=True)

In [7]:
# Remove all special characters 
cat = categories.replace(r'[^A-Za-z0-9]', r' ', regex=True)

In [8]:
# Now we have more clear categories
cat.head()

Unnamed: 0,0,1,2,3
0,Style Fashion,Women s Fashion,Women s Clothing,Women s Casual Wear
1,Personal Finance,Personal Investing,,
2,Healthy Living,Men s Health,,
3,Medical Health,Diseases and Conditions,Endocrine and Metabolic Diseases,Hormonal Disorders
4,Hobbies Interests,Games and Puzzles,Card Games,


In [9]:
newdf = pd.concat([mydf, cat], axis = 1)
newdf.head()

Unnamed: 0,index,id,content,title,preprocessed_content,label,hierarchy,0,1,2,3
0,133,https://www.whowhatwear.com/5-looks-that-put-b...,After seeing burgundy boots on some of our fav...,5 Looks That Put Burgundy Boots On Our Must-Ha...,seeing burgundy boot favorite street style blo...,Womens\ Casual\ Wear,"[{'Style & Fashion': {""Women's Fashion"": {""Wom...",Style Fashion,Women s Fashion,Women s Clothing,Women s Casual Wear
1,189,https://www.wisebread.com/what-you-need-to-kno...,"According to a September 29, 2009 Fidelity Inv...",How to Make the Most of Your 401K,according september fidelity investments study...,Personal\ Investing,[{'Personal Finance': 'Personal Investing'}],Personal Finance,Personal Investing,,
2,401,https://www.theturekclinic.com/blog/mindful-ea...,"“Thou shouldst eat to live; not live to eat,” ...",Mindful Eating,thou shouldst eat live live eat said socrates ...,Mens\ Health,"[{'Healthy Living': ""Men's Health""}]",Healthy Living,Men s Health,,
3,198,https://www.news-medical.net/news/20190611/Ear...,Researchers continue to dig for molecular clue...,Early life adversity and high levels of FKBP5 ...,researchers continue dig molecular clue better...,Hormonal\ Disorders,[{'Medical Health': {'Diseases and Conditions'...,Medical Health,Diseases and Conditions,Endocrine and Metabolic Diseases,Hormonal Disorders
4,24,https://hobbylark.com/card-games/Top-Strongest...,"As any duelist knows, the vast majority of Yu-...",Top 10 Strongest (Highest ATK) Monsters in Yu-...,duelist know vast majority yu gi oh match end ...,Card\ Games,[{'Hobbies & Interests': {'Games and Puzzles':...,Hobbies Interests,Games and Puzzles,Card Games,


In [10]:
import nltk

In [11]:
def preprocess_text(text):
    lemma = WordNetLemmatizer()
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [lemma.lemmatize(word).lower() for word in tokens if word.lower() not in stop_words and not word.isdigit()]
    return re.sub("\s\s+" , " ", ' '.join(words))

In [12]:
newdf['content'] = newdf['content'].astype(str)

In [13]:
newdf['cleaned_content'] = newdf['content'].apply(preprocess_text)

In [14]:
newdf.columns = ['index','id','content','title', 'preprocessed_content','label','hierarchy', 'first','second','third','fourth', 'cleaned_content']

In [15]:
newdf.head()

Unnamed: 0,index,id,content,title,preprocessed_content,label,hierarchy,first,second,third,fourth,cleaned_content
0,133,https://www.whowhatwear.com/5-looks-that-put-b...,After seeing burgundy boots on some of our fav...,5 Looks That Put Burgundy Boots On Our Must-Ha...,seeing burgundy boot favorite street style blo...,Womens\ Casual\ Wear,"[{'Style & Fashion': {""Women's Fashion"": {""Wom...",Style Fashion,Women s Fashion,Women s Clothing,Women s Casual Wear,seeing burgundy boot favorite street style blo...
1,189,https://www.wisebread.com/what-you-need-to-kno...,"According to a September 29, 2009 Fidelity Inv...",How to Make the Most of Your 401K,according september fidelity investments study...,Personal\ Investing,[{'Personal Finance': 'Personal Investing'}],Personal Finance,Personal Investing,,,according september fidelity investments study...
2,401,https://www.theturekclinic.com/blog/mindful-ea...,"“Thou shouldst eat to live; not live to eat,” ...",Mindful Eating,thou shouldst eat live live eat said socrates ...,Mens\ Health,"[{'Healthy Living': ""Men's Health""}]",Healthy Living,Men s Health,,,thou shouldst eat live live eat said socrates ...
3,198,https://www.news-medical.net/news/20190611/Ear...,Researchers continue to dig for molecular clue...,Early life adversity and high levels of FKBP5 ...,researchers continue dig molecular clue better...,Hormonal\ Disorders,[{'Medical Health': {'Diseases and Conditions'...,Medical Health,Diseases and Conditions,Endocrine and Metabolic Diseases,Hormonal Disorders,researchers continue dig molecular clue better...
4,24,https://hobbylark.com/card-games/Top-Strongest...,"As any duelist knows, the vast majority of Yu-...",Top 10 Strongest (Highest ATK) Monsters in Yu-...,duelist know vast majority yu gi oh match end ...,Card\ Games,[{'Hobbies & Interests': {'Games and Puzzles':...,Hobbies Interests,Games and Puzzles,Card Games,,duelist know vast majority yu gi oh match end ...


column: depth : the total number of levels --> and then predict? 
try to see if it's accurate

if it's accurate,

In [16]:
newdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9192 entries, 0 to 9191
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   index                 9191 non-null   object
 1   id                    9192 non-null   object
 2   content               9192 non-null   object
 3   title                 8949 non-null   object
 4   preprocessed_content  8995 non-null   object
 5   label                 9170 non-null   object
 6   hierarchy             9169 non-null   object
 7   first                 9169 non-null   object
 8   second                8653 non-null   object
 9   third                 4067 non-null   object
 10  fourth                1438 non-null   object
 11  cleaned_content       9192 non-null   object
dtypes: object(12)
memory usage: 861.9+ KB


In [17]:
first_level = newdf[['cleaned_content', 'first']]

In [18]:
first_level

Unnamed: 0,cleaned_content,first
0,seeing burgundy boot favorite street style blo...,Style Fashion
1,according september fidelity investments study...,Personal Finance
2,thou shouldst eat live live eat said socrates ...,Healthy Living
3,researchers continue dig molecular clue better...,Medical Health
4,duelist know vast majority yu gi oh match end ...,Hobbies Interests
...,...,...
9187,might get loud productions announces first key...,Hobbies Interests
9188,tonight year end fundraiser free clinic workin...,Healthy Living
9189,modern mobile web development unlimited number...,Technology Computing
9190,two new book spring mystery writer come togeth...,Books and Literature


In [19]:
# Drop NaNs of first level 
first_level.dropna(inplace=True)

In [20]:
#TF-IDF(Term Frequency – Inverse Document Frequency) 인코딩
#단어를 갯수 그대로 카운트하지 않고 모든 문서에 공통적으로 들어있는 단어의 경우 문서 구별 능력이 떨어진다고 보아 가중치를 축소하는 방법이다.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(first_level['cleaned_content']).toarray()
labels = first_level['first']
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("TF-IDF_Accuracy",metrics.accuracy_score(y_test, y_pred))
print("TF-IDF_Precision",metrics.precision_score(y_test, y_pred,average='weighted'))
print("TF-IDF_Recall",metrics.recall_score(y_test, y_pred,average='weighted'))
print("TF-IDF_F1-score",metrics.f1_score(y_test, y_pred,average='weighted'))

TF-IDF_Accuracy 0.8826834104428288
TF-IDF_Precision 0.8836265261108233
TF-IDF_Recall 0.8826834104428288
TF-IDF_F1-score 0.8766261559340929


# Data for Two Levels


In [21]:
two_levels = newdf[['cleaned_content', 'first', 'second']]

In [22]:
two_levels.head()

Unnamed: 0,cleaned_content,first,second
0,seeing burgundy boot favorite street style blo...,Style Fashion,Women s Fashion
1,according september fidelity investments study...,Personal Finance,Personal Investing
2,thou shouldst eat live live eat said socrates ...,Healthy Living,Men s Health
3,researchers continue dig molecular clue better...,Medical Health,Diseases and Conditions
4,duelist know vast majority yu gi oh match end ...,Hobbies Interests,Games and Puzzles


In [23]:
two_levels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9192 entries, 0 to 9191
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   cleaned_content  9192 non-null   object
 1   first            9169 non-null   object
 2   second           8653 non-null   object
dtypes: object(3)
memory usage: 215.6+ KB


## Make NA as 'Unknown'

In [24]:
import numpy as np
two_levels2 = two_levels.replace(np.nan, 'Unknown', regex=True)

In [25]:
two_levels2[two_levels2['first'] == "Unknown"]

Unnamed: 0,cleaned_content,first,second
761,pete townshend quiet unassuming 18th century h...,Unknown,Unknown
1732,decade covering swiss luxury watch industry sa...,Unknown,Unknown
2484,week go inbox filled small barrage announcemen...,Unknown,Unknown
3190,reilly media co founder dale dougherty coined ...,Unknown,Unknown
3753,fifteen years david bowie ring master rock sty...,Unknown,Unknown
4088,got get velcro bo diddley advises ask phone ce...,Unknown,Unknown
4397,steven ballmer year old executive vice preside...,Unknown,Unknown
4411,kevin carey essay huffington post last week cr...,Unknown,Unknown
4684,first thing learn rocklahoma glam metal band g...,Unknown,Unknown
4722,hong kong august 16th young reporter hong kong...,Unknown,Unknown


In [26]:
two_levels2[two_levels2['second'] == "Unknown"]

Unnamed: 0,cleaned_content,first,second
11,shoulder look fun game mid red carpet readjust...,Style Fashion,Unknown
24,small cat uk barely survived attacked fox larg...,Pets,Unknown
28,saw claire danes new cover interview magazine ...,Style Fashion,Unknown
41,shopping wedding gown one fun exciting part pl...,Events and Attractions,Unknown
47,read detailed description ancestry com came ex...,Shopping,Unknown
...,...,...,...
9066,check store offer health beauty coupon deal re...,Shopping,Unknown
9075,sep joe kleiman attractions business events ne...,Events and Attractions,Unknown
9136,rewards day annual shopback event gather digit...,Shopping,Unknown
9148,tis season sun salt sweat pleasant sound well ...,Style Fashion,Unknown


In [27]:
two_levels2.info() # No NAs by replacing them to "Unknown"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9192 entries, 0 to 9191
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   cleaned_content  9192 non-null   object
 1   first            9192 non-null   object
 2   second           9192 non-null   object
dtypes: object(3)
memory usage: 215.6+ KB


### 1) the first level prediction

In [28]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(two_levels2['cleaned_content']).toarray()
labels = two_levels2['first']
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [29]:
y_pred

array(['   Pop Culture ', '   Business and Finance ',
       '   Medical Health ', ..., '   Travel ', '   Hobbies   Interests ',
       '   Books and Literature '], dtype=object)

In [30]:
y_test

8085                  Pop Culture 
8942       Events and Attractions 
3476               Medical Health 
9178                    Education 
1172               Healthy Living 
                   ...            
4773               Medical Health 
235           Hobbies   Interests 
5396                       Travel 
2909          Hobbies   Interests 
2340         Books and Literature 
Name: first, Length: 3034, dtype: object

In [31]:
X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [32]:
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [33]:
y_train

5025                       Sports 
1108               Healthy Living 
1232          Hobbies   Interests 
8755         Business and Finance 
2333             Personal Finance 
                   ...            
4373         Business and Finance 
7891       Technology   Computing 
4859         Business and Finance 
3264                      Science 
2732                       Movies 
Name: first, Length: 6158, dtype: object

In [34]:
two_levels2.head()

Unnamed: 0,cleaned_content,first,second
0,seeing burgundy boot favorite street style blo...,Style Fashion,Women s Fashion
1,according september fidelity investments study...,Personal Finance,Personal Investing
2,thou shouldst eat live live eat said socrates ...,Healthy Living,Men s Health
3,researchers continue dig molecular clue better...,Medical Health,Diseases and Conditions
4,duelist know vast majority yu gi oh match end ...,Hobbies Interests,Games and Puzzles


In [35]:
from sklearn.model_selection import train_test_split

cleaned_content_train, cleaned_content_test = train_test_split(two_levels2['cleaned_content'], test_size=0.33)

In [36]:
cleaned_content_train

2056    dublin march prnewswire population health mana...
8629    epa approves new fungicide delivered honey bee...
8504    amber rolfeso job interview cue sigh relief wh...
2964    los angeles extended stay hotel portfolio choi...
7213    dean elgar half joking pretended take armband ...
                              ...                        
46      end soap opera fora marine shipyard gone recei...
1705    carly broder san francisco gave birth son isai...
303     chiasma inc privately held biopharma company a...
5324    paulo dec prnewswire rebel digital consumer fi...
7710    country afford entirely stop air travel carbon...
Name: cleaned_content, Length: 6158, dtype: object

In [37]:
cleaned_content_test

8068    magic aura enchantment artifact equipment desi...
6321    kelly clarksonhow exerciselost around lb novem...
21      serverless computing function service faas inc...
7498    trading thursday apparel store share relative ...
5932    michael chearyyou take trip russia hit goal su...
                              ...                        
2079    men thrift store different kind shopping watch...
1719    former megadeth drummer nick menza mom calling...
8601    new book dessert two countryman press feb chri...
5422    roland professional v audinate creator dante n...
5420    post contains reference product advertiser may...
Name: cleaned_content, Length: 3034, dtype: object

# Start with making dataframe, including the second level

In [87]:
# a : with real first level
a = pd.DataFrame(list(zip(cleaned_content_train, two_levels2["first"], two_levels2['second'])))

In [88]:
# b : with predicted first level
b = pd.DataFrame(list(zip(cleaned_content_test, y_pred, two_levels2['second'])))

In [89]:
a.columns = ['content', 'real_first', 'second']
b.columns = ['content', 'predicted_first', 'second']

In [90]:
a.head()

Unnamed: 0,content,real_first,second
0,dublin march prnewswire population health mana...,Style Fashion,Women s Fashion
1,epa approves new fungicide delivered honey bee...,Personal Finance,Personal Investing
2,amber rolfeso job interview cue sigh relief wh...,Healthy Living,Men s Health
3,los angeles extended stay hotel portfolio choi...,Medical Health,Diseases and Conditions
4,dean elgar half joking pretended take armband ...,Hobbies Interests,Games and Puzzles


In [91]:
b.head()

Unnamed: 0,content,predicted_first,second
0,magic aura enchantment artifact equipment desi...,Pop Culture,Women s Fashion
1,kelly clarksonhow exerciselost around lb novem...,Business and Finance,Personal Investing
2,serverless computing function service faas inc...,Medical Health,Men s Health
3,trading thursday apparel store share relative ...,Education,Diseases and Conditions
4,michael chearyyou take trip russia hit goal su...,Healthy Living,Games and Puzzles


### make train_features by combining the content and the real first level as sparse matrix

In [85]:
train_features2_1 = tfidf.fit_transform(a.content)
train_features2_2 = tfidf.fit_transform(a.real_first)

In [76]:
train_features2_1

<6158x48495 sparse matrix of type '<class 'numpy.float64'>'
	with 1452331 stored elements in Compressed Sparse Row format>

In [77]:
train_features2_2

<6158x57 sparse matrix of type '<class 'numpy.float64'>'
	with 14350 stored elements in Compressed Sparse Row format>

In [81]:
# hstack to avoid .stack() in tfidf
from scipy.sparse import hstack

train_features_content_realfirst = hstack([train_features2_1,train_features2_2])

In [82]:
train_features_content_realfirst

<6158x48552 sparse matrix of type '<class 'numpy.float64'>'
	with 1466681 stored elements in COOrdinate format>

### make test_features by combining the content and the real first level as sparse matrix

In [92]:
test_features2_1 = tfidf.fit_transform(b.content)
test_features2_2 = tfidf.fit_transform(b.predicted_first)
test_features_content_predictedfirst = hstack([test_features2_1,test_features2_2])

In [93]:
test_features_content_predictedfirst 

<3034x23566 sparse matrix of type '<class 'numpy.float64'>'
	with 646142 stored elements in COOrdinate format>

### make target for train and test

In [94]:
train_target = a.iloc[:, -1] # target variable for train_features # with real first level
train_target

0                  Women s Fashion 
1             Personal Investing   
2                   Men s Health   
3          Diseases and Conditions 
4                Games and Puzzles 
                   ...             
6153                 Personal Debt 
6154                        Unknown
6155                        Unknown
6156                Martial Arts   
6157                     Computing 
Name: second, Length: 6158, dtype: object

In [95]:
test_target = b.iloc[:, -1] # target variable for test_features
test_target

0                  Women s Fashion 
1             Personal Investing   
2                   Men s Health   
3          Diseases and Conditions 
4                Games and Puzzles 
                   ...             
3029                        Unknown
3030            Personal Investing 
3031                     Computing 
3032                Oldies Music   
3033                   Parenting   
Name: second, Length: 3034, dtype: object

## X_train, X_test, y_train, y_test == train_features, test_features, train_target, test_target
- train_features = content & real first level labels
- train_target = real second level labels
- test_features = Based on this variable, we want to predict the second level. It's including the content, and the predicted first level lables by the first model.

### We hope to see if we can predict the second level, after predicting the first level. 


# Q)

In [96]:
model = LinearSVC()
model.fit(train_features_content_realfirst, train_target)
prediction = model.predict(test_features_content_predictedfirst)

ValueError: X has 23566 features per sample; expecting 48552

In [None]:
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)

# print("TF-IDF_Accuracy",metrics.accuracy_score(y_test, y_pred))
# print("TF-IDF_Precision",metrics.precision_score(y_test, y_pred,average='weighted'))
# print("TF-IDF_Recall",metrics.recall_score(y_test, y_pred,average='weighted'))
# print("TF-IDF_F1-score",metrics.f1_score(y_test, y_pred,average='weighted'))

https://datascience.stackexchange.com/questions/22813/using-tf-idf-with-other-features-in-scikit-learn

1) hstack
2) build pipeline

In [None]:
# tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2), stop_words='english')
# features = tfidf.fit_transform(two_levels2['cleaned_content']).toarray()
# labels = two_levels2['first']
# X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
# model = LinearSVC()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)

# Q) How can we use two X features and split into X_train, X_test, y_train, y_test...?

In [66]:
X_train, y_train = train_test_split(train_features, second_labels , test_size=0.33)


ValueError: Found input variables with inconsistent numbers of samples: [12316, 9192]

In [None]:
X_train, y_train = train_test_split(train_features, second_labels , test_size=0.33)
X_test, y_test = train_test_split(test_features, second_labels , test_size=0.33)

In [None]:
# tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2), stop_words='english')
# features = tfidf.fit_transform(two_levels2['cleaned_content']).toarray()
# labels = two_levels2['first']
# X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)
# model = LinearSVC()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)

In [None]:
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 2), stop_words='english')
train_features = tfidf.fit_transform(first_level['cleaned_content_train', "first_level"]).toarray()
test_features = tfidf.fit_transform(first_level['cleaned_content_test', y_pred]).toarray()

labels = first_level['second']

#X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=0)

# do not do the train_test_split above,

# With train_features, you need to make two things: X_train, y_train
# With test_feautures, you need to make two things: X_test, y_test 
# --> This is because during test, it should not include first level information!! you only have content!

# and then do the followings.
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)






model2 = model2(Feature, L2)

or

model2 = model2(Feature + y_pred of Level1, level2)

*** Feature+y_pred of Level1 --> this should be training data


In [51]:
len(features)

9169

## Q) but how can I deal with all the Nans in 2,3,4th levels?

# Q) How do I make it for hierarchical classifications?

# Q) Error in Pre-process 'content' using spaCy

In [10]:
#pip install -U spacy

In [11]:
import spacy

In [None]:
#conda install -c conda-forge spacy

In [18]:
nlp = spacy.load("en")

# English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.

OSError: [E050] Can't find model 'en'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [15]:
python -m spacy download en

SyntaxError: invalid syntax (<ipython-input-15-fc4d5d118d23>, line 1)

In [16]:
nlp = spacy.load('en_core_web_sm') 

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [17]:
python -m spacy download en_core_web_sm

SyntaxError: invalid syntax (<ipython-input-17-e8a31c0c54ec>, line 1)

In [19]:
import en_core_web_sm

nlp = en_core_web_sm.load()

ModuleNotFoundError: No module named 'en_core_web_sm'

In [None]:

def cleaning(content):
    removal = ['ADV', 'PRON', 'CCONJ', 'PUNCT','PART','DET','ADP','SPACE']
    text_out = []
    doc = nlp(text)
    for tokens in doc:
        if tokens.is_stop == False and tokens.is_alpha and len(tokens)>2 and tokens.pos_ not in removal:
            lemma = tokens.lemma_
            text_out.append(lemma)
    return text_out       

In [None]:
preprocessed = newdf.content.apply(lambda x: cleaning(x))

is_alpha: Does the token consist of alphabetic characters?

pos_: Coarse-grained part-of-speech from the Universal POS tag set

lemma_ : to get root word as a dictionary form of a word, by removing inflectional endings only. This converts word contractions such as "can't" to "can" and "not"

References:
https://towardsdatascience.com/setting-up-text-preprocessing-pipeline-using-scikit-learn-and-spacy-e09b9b76758f
https://gist.github.com/smsubrahmannian/2835bd32c688b7b57a5300f94af07b1b

- the Universal POS tag set

ADJ: adjective
ADP: adposition
ADV: adverb
AUX: auxiliary
CCONJ: coordinating conjunction
DET: determiner
INTJ: interjection
NOUN: noun
NUM: numeral
PART: particle
PRON: pronoun
PROPN: proper noun
PUNCT: punctuation
SCONJ: subordinating conjunction
SYM: symbol
VERB: verb
X: other


In [None]:
#mydf.hierarchy = mydf.hierarchy.str.replace("\", "")
#from pandas.io.json import json_normalize
#lol = json_normalize(data = mydf, record_path ='hierarchy', meta =['first', 'second', 'third', 'fourth'])