Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from pprint import pprint

In [2]:
data_df = pd.read_csv('fashionblogs1.csv',index_col=0)

In [3]:
data_df

Unnamed: 0,fashiontext
fashionspotcontent,"""There are certain staples every wardrobe need..."
justthedesign,Our Reasons To Give Into The Embroidery Trend ...
mariecuriefashiontext,https://www.marieclaire.co.uk/fashion/build-a-...
refinery29,Velvet material was seen all over the fall run...
vogueIndia,5 designer labels you will always find in alia...
voguefashiontext,//www.vogue.co.uk//fashion/article/guide-to-b...


Analysis: To get quick overview of Data

In [91]:
data_df['char_count'] = data_df['fashiontext'].str.len() ## this also includes spaces
data_df[['fashiontext','char_count']].head()

Unnamed: 0,fashiontext,char_count
fashionspotcontent,"""There are certain staples every wardrobe need...",1989129
justthedesign,Our Reasons To Give Into The Embroidery Trend ...,6289320
mariecuriefashiontext,https://www.marieclaire.co.uk/fashion/build-a-...,604003
refinery29,Velvet material was seen all over the fall run...,1999745
vogueIndia,5 designer labels you will always find in alia...,6691634


To calculate average word length

In [93]:
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

data_df['avg_word'] = data_df['fashiontext'].apply(lambda x: avg_word(x))
data_df[['fashiontext','avg_word']].head()

Unnamed: 0,fashiontext,avg_word
fashionspotcontent,"""There are certain staples every wardrobe need...",5.936963
justthedesign,Our Reasons To Give Into The Embroidery Trend ...,4.988989
mariecuriefashiontext,https://www.marieclaire.co.uk/fashion/build-a-...,5.025445
refinery29,Velvet material was seen all over the fall run...,5.429397
vogueIndia,5 designer labels you will always find in alia...,4.941884


Calculating number of predefined stopwords in text

In [94]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

data_df['stopwords'] = data_df['fashiontext'].apply(lambda x: len([x for x in x.split() if x in stop]))
data_df[['fashiontext','stopwords']].head()

Unnamed: 0,fashiontext,stopwords
fashionspotcontent,"""There are certain staples every wardrobe need...",78521
justthedesign,Our Reasons To Give Into The Embroidery Trend ...,386838
mariecuriefashiontext,https://www.marieclaire.co.uk/fashion/build-a-...,35925
refinery29,Velvet material was seen all over the fall run...,87841
vogueIndia,5 designer labels you will always find in alia...,397249


Calculating number of digits in text data

In [95]:
data_df['numerics'] = data_df['fashiontext'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
data_df[['fashiontext','numerics']].head()

Unnamed: 0,fashiontext,numerics
fashionspotcontent,"""There are certain staples every wardrobe need...",1847
justthedesign,Our Reasons To Give Into The Embroidery Trend ...,5823
mariecuriefashiontext,https://www.marieclaire.co.uk/fashion/build-a-...,411
refinery29,Velvet material was seen all over the fall run...,228
vogueIndia,5 designer labels you will always find in alia...,6848


Calculating number of Lower case words

In [5]:
data_df['lower'] = data_df['fashiontext'].apply(lambda x: len([x for x in x.split() if x.islower()]))
data_df[['fashiontext','lower']].head()

Unnamed: 0,fashiontext,lower
fashionspotcontent,"""There are certain staples every wardrobe need...",189867
justthedesign,Our Reasons To Give Into The Embroidery Trend ...,866104
mariecuriefashiontext,https://www.marieclaire.co.uk/fashion/build-a-...,81235
refinery29,Velvet material was seen all over the fall run...,218814
vogueIndia,5 designer labels you will always find in alia...,901664


In [6]:
Calculating number of Upper case words

SyntaxError: invalid syntax (<ipython-input-6-d204c6f85b4b>, line 1)

In [97]:
data_df['upper'] = data_df['fashiontext'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
data_df[['fashiontext','upper']].head()

Unnamed: 0,fashiontext,upper
fashionspotcontent,"""There are certain staples every wardrobe need...",1588
justthedesign,Our Reasons To Give Into The Embroidery Trend ...,16141
mariecuriefashiontext,https://www.marieclaire.co.uk/fashion/build-a-...,1837
refinery29,Velvet material was seen all over the fall run...,3049
vogueIndia,5 designer labels you will always find in alia...,11441


### Data Cleaning - Removing Punctuations, Links, numbers, non-english words 

In [103]:
import re
import string
def remove_links(webtext):
    
    tweet = re.sub(r'https:\S+', '', webtext) # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', webtext) # rempve bitly links
    tweet = re.sub(r'//www.\S+', '', webtext)
    tweet = webtext.strip('[link]') # remove [links]
    return tweet
round0 = lambda x: remove_links(x)

In [104]:
data_clean = pd.DataFrame(data_df.fashiontext.apply(round0))
data_clean

Unnamed: 0,fashiontext
fashionspotcontent,"""There are certain staples every wardrobe need..."
justthedesign,Our Reasons To Give Into The Embroidery Trend ...
mariecuriefashiontext,https://www.marieclaire.co.uk/fashion/build-a-...
refinery29,Velvet material was seen all over the fall run...
vogueIndia,5 designer labels you will always find in alia...
voguefashiontext,//www.vogue.co.uk//fashion/article/guide-to-b...


In [105]:
def clean_text_1(text):
    
    text = text.lower()
    text = re.sub('\xa0', ' ', text)
    text = re.sub('\[.*?\]', '', text)
    
    text = re.sub('\s+', ' ', text) #remove double spacing
    text = re.sub('([0-9]+)', '', text) # remove numbers
    text = re.sub("[^0-9A-Za-z///]", ' ', text) ##remove non english word
    
    return text

round1 = lambda x: clean_text_1(x)

In [106]:
data_clean = pd.DataFrame(data_clean.fashiontext.apply(round1))

In [115]:
# Apply a second round of cleaning
def clean_text_2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('xa', ' ', text)
    text = re.sub('https', ' ', text)
    text = re.sub('www', ' ', text)
    text = re.sub(r'[^\w]', ' ', text)
    text = re.sub('contactus', ' ', text)
    text = re.sub(r'\b\w{12,}\b', '', text) #keeping word with length less than 12
    
    return text

round2 = lambda x: clean_text_2(x)

In [116]:
data_clean = pd.DataFrame(data_clean.fashiontext.apply(round2))
data_clean

Unnamed: 0,fashiontext
fashionspotcontent,certain staples every wardrobe needs especiall...
justthedesign,reasons give embroidery trend wear itby margar...
mariecuriefashiontext,marieclaire co sustainable wardrobe words ...
refinery29,velvet material seen fall runways around luxur...
vogueIndia,designer labels find alia bhatt wardrobe viral...
voguefashiontext,vogue co buying watch watcheshow buy watch v...


### Stopword removal

In [117]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
data_clean['fashiontext'] = data_clean['fashiontext'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))


### Adding most occuring words to stopword list

In [118]:
freq = pd.Series(' '.join(data_clean['fashiontext']).split()).value_counts()[:25]

In [119]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

stop_words.extend(['like','etc','often','made',"oh'",'also','still', 'look', 'one', 'new', 'know','kapoor','thats','two','youre','one','look','like','always','really','well','want','week','dont','back','says','day','something','go','take','even','get','us','made','year','way','make','show','talk','also','time','collection','one','way'])


In [120]:
data_clean['fashiontext'] = data_clean['fashiontext'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))


### Removing words occuring rarely

In [121]:
freq = pd.Series(' '.join(data_clean['fashiontext']).split()).value_counts()[-100:]
freq

cherubic      1
catalytic     1
clinks        1
benjikahnx    1
stuffs        1
             ..
aza           1
poetess       1
kunzites      1
postponing    1
desir         1
Length: 100, dtype: int64

In [123]:
freq = list(freq.index)
data_clean['fashiontext'] = data_clean['fashiontext'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
data_clean['fashiontext'].head()

fashionspotcontent       certain staples every wardrobe needs especiall...
justthedesign            reasons give embroidery trend wear itby margar...
mariecuriefashiontext    marieclaire co sustainable wardrobe words rosa...
refinery29               velvet material seen fall runways around luxur...
vogueIndia               designer labels find alia bhatt wardrobe viral...
Name: fashiontext, dtype: object

### Stemming and lemmatization

In [124]:
from textblob import Word
from nltk.stem import SnowballStemmer
stemmer_english = SnowballStemmer('english')

In [125]:
data_clean['fashiontext']

fashionspotcontent       certain staples every wardrobe needs especiall...
justthedesign            reasons give embroidery trend wear itby margar...
mariecuriefashiontext    marieclaire co sustainable wardrobe words rosa...
refinery29               velvet material seen fall runways around luxur...
vogueIndia               designer labels find alia bhatt wardrobe viral...
voguefashiontext         vogue co buying watch watcheshow buy watch vog...
Name: fashiontext, dtype: object

In [126]:
data_clean['fashiontext']=data_clean['fashiontext'].apply(lambda x : filter(None,x.split(" ")))

In [127]:
data_clean['fashiontext']=data_clean['fashiontext'].apply(lambda x : [stemmer_english.stem(y) for y in x])

In [128]:
data_clean['fashiontext']=data_clean['fashiontext'].apply(lambda x : " ".join(x))

In [129]:
data_clean['fashiontext'] = data_clean['fashiontext'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))


In [130]:
data_clean['fashiontext']

fashionspotcontent       certain stapl everi wardrob need especi come s...
justthedesign            reason give embroideri trend wear itbi margare...
mariecuriefashiontext    marieclair co sustain wardrob word rosanna fal...
refinery29               velvet materi seen fall runway around luxuri c...
vogueIndia               design label find alia bhatt wardrob viral bha...
voguefashiontext         vogu co buy watch watcheshow buy watch vogu ul...
Name: fashiontext, dtype: object

In [131]:
freq = pd.Series(' '.join(data_clean['fashiontext']).split()).value_counts()[:25]
freq

fashion    18201
wear       12339
style      12158
dress      11004
design     10108
trend       6880
black       6717
brand       6330
bag         5854
work        5827
vogu        5713
jean        5453
white       5260
top         4990
pair        4768
look        4719
woman       4697
outfit      4484
shoe        4153
first       4112
piec        3998
co          3965
jacket      3869
come        3844
love        3842
dtype: int64

### Stopwords Adding Most repeated words 'Fashion', 'wear','style','design'

In [7]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

stop_words.extend(['like','look','one','new','time','also','first','show','make','way','year','people','made','work','us','get','even','brand','even','take','go','something','day','pair','says','dont','years','think','brands','week','world','best','would','want','really','well','always','see','every','looks','youre','last','trend','many','much','know','kapoor','thats','need','im','instagram','next','photos','perfect','right','theres','around','set','said','going','things','making','never','come','could','since','little','everything','looking','comes','help','different','find','lot','say','another','bhayani','photo','ever','shes','piece','together','shows','seen','home','line','keep','important','house','got','ahead','thing','yet','shared','create','whether','including','without','makes','cant','number','often','theyre','add','give','though','took','try','today','paired','put','already','ready','went','came','moment','started','place','use','doesnt','director','didnt','ive','isnt','times','told','change','become','everyone','youll','family','course','social','actually','might','wanted','khan','image','matching','taking','away','end','based','month','less','ways','especially','anything','known','behind','across','created','free','enough','via','actor','company','met','head','media','getting','though','saw','almost','using','site','picks','x','scroll','images','must','instead','read','second','felt','choice','quite','worked','space','nothing','bring','according','options','means','ones','school','later','choose','among','done','able','called','someone','youve','let','pick','tells','wasnt','ago','person','ensemble','whats','given','occasion','kareena','open','link','form','sonam','explains','version','heres','goes','taken','takes','simply','ahuja','definitely','fashion','wear','style','design','trend'])


In [133]:
data_clean['fashiontext'] = data_clean['fashiontext'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))
data_clean

Unnamed: 0,fashiontext
fashionspotcontent,certain stapl everi wardrob especi summer talk...
justthedesign,reason embroideri itbi margaret wright recogni...
mariecuriefashiontext,marieclair co sustain wardrob word rosanna fal...
refinery29,velvet materi fall runway luxuri colour versat...
vogueIndia,label alia bhatt wardrob viral may statement i...
voguefashiontext,vogu co buy watch watcheshow buy watch vogu ul...


In [135]:
freq = pd.Series(' '.join(data_clean['fashiontext']).split()).value_counts()[-200:]
freq

buchenwald    1
larroud       1
schonfeld     1
hamersveld    1
vinoodhfor    1
             ..
bandiera      1
ugra          1
bano          1
cousu         1
catchup       1
Length: 200, dtype: int64

In [136]:
freq = list(freq.index)
data_clean['fashiontext'] = data_clean['fashiontext'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
data_clean['fashiontext'].head()

fashionspotcontent       certain stapl everi wardrob especi summer talk...
justthedesign            reason embroideri itbi margaret wright recogni...
mariecuriefashiontext    marieclair co sustain wardrob word rosanna fal...
refinery29               velvet materi fall runway luxuri colour versat...
vogueIndia               label alia bhatt wardrob viral may statement i...
Name: fashiontext, dtype: object

In [146]:
#data_df.to_pickle("webblogcorpus.pkl")

In [147]:
#data_clean.to_pickle("finalclean2withfashionweardress.pkl")

### Creating 'Document Term Matrix' using 'TF-IDF' and 'Countvectorizer' also known as BoW 

In [148]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [149]:
vectorizer = TfidfVectorizer(stop_words='english')

In [150]:
data_tfidf = vectorizer.fit_transform(data_clean.fashiontext)
data_tdf = pd.DataFrame(data_tfidf.toarray(), columns=vectorizer.get_feature_names())
data_tdf.index = data_clean.index
data_tdf

Unnamed: 0,aa,aback,abandon,abbasi,abbey,abbot,abdomen,abet,abey,abject,...,zipper,zircon,zirconia,zo,zoa,zodiac,zombi,zone,zoo,zoom
fashionspotcontent,0.0,0.0,0.001086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002506,0.0,0.0,0.001754,0.000489,0.00058,0.000339,0.001737,0.000677,0.001504
justthedesign,0.000465,0.000381,0.001342,0.000381,0.000238,0.0,0.000465,0.0,0.00093,0.000465,...,0.001787,0.000233,0.000138,0.001787,0.0,0.00069,0.000805,0.003303,0.000322,0.003336
mariecuriefashiontext,0.0,0.0,0.00081,0.0,0.004675,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002166,0.0,0.0,0.0,0.0,0.002431,0.0,0.010286
refinery29,0.0,0.0,9.8e-05,0.0,0.000113,0.00066,0.0,0.0,0.0,0.0,...,0.00169,0.0,0.0,0.005295,0.0,0.0,0.0,0.000683,0.0,0.0
vogueIndia,0.0,0.000179,0.001552,0.000179,0.00056,0.0,0.0,0.000218,0.0,0.0,...,0.001455,0.0,0.000259,0.002015,0.0,0.002074,0.000303,0.002716,0.000151,0.0075
voguefashiontext,0.0,0.0,0.001169,0.0,0.002024,0.0,0.0,0.0,0.0,0.0,...,0.002699,0.0,0.001563,0.000675,0.0,0.000781,0.0,0.002923,0.0,0.006073


In [151]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.fashiontext)
data_matrix = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_matrix.index = data_clean.index
data_matrix

Unnamed: 0,aa,aback,abandon,abbasi,abbey,abbot,abdomen,abet,abey,abject,...,zipper,zircon,zirconia,zo,zoa,zodiac,zombi,zone,zoo,zoom
fashionspotcontent,0,0,5,0,0,0,0,0,0,0,...,10,0,0,7,1,2,1,8,2,6
justthedesign,2,2,13,2,2,0,2,0,4,2,...,15,1,1,15,0,5,5,32,2,28
mariecuriefashiontext,0,0,1,0,5,0,0,0,0,0,...,0,0,2,0,0,0,0,3,0,11
refinery29,0,0,1,0,1,3,0,0,0,0,...,15,0,0,47,0,0,0,7,0,0
vogueIndia,0,1,16,1,5,0,0,1,0,0,...,13,0,2,18,0,16,2,28,1,67
voguefashiontext,0,0,2,0,3,0,0,0,0,0,...,4,0,2,1,0,1,0,5,0,9


In [152]:
#data_tdf.to_pickle("documenttermmatrixtdf.pkl")

In [153]:
import pickle
#pickle.dump(cv, open("cv.pkl", "wb"))