<a href="https://colab.research.google.com/github/manjunath-hanmantgad/python-development/blob/master/Extracting_Features_from_Text_Variables.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk

# Resources used by the cells below: the Punkt sentence-tokenizer models and
# the stop-word list.
# Recent NLTK releases (3.9+) load the 'punkt_tab' resource in sent_tokenize()
# rather than the pickled 'punkt' models, so both ids are requested to keep
# the notebook working on old and new releases alike.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# Fetch the training split of the 20 Newsgroups corpus and place the raw
# posts into a one-column DataFrame named 'text'.
data = fetch_20newsgroups(subset='train')
df = pd.DataFrame({'text': data.data})
df.head()

Unnamed: 0,text
0,From: lerxst@wam.umd.edu (where's my thing)\nS...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...


In [4]:
# Character count of each document (every character is counted, whitespace included).
char_counts = df['text'].str.len()
df['num_of_char'] = char_counts
df.head()

Unnamed: 0,text,num_of_char
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,721
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,858
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,1981
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,815
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,1120


In [8]:
# Word count of each document.

# First re-measure the character count on text with leading/trailing
# whitespace stripped; this overwrites the existing 'num_of_char' column.
trimmed_text = df['text'].str.strip()
df['num_of_char'] = trimmed_text.str.len()

# split() without a separator breaks on runs of whitespace, so the length of
# each resulting list is the number of whitespace-delimited tokens.
token_lists = df["text"].str.split()
df["num_of_words"] = token_lists.str.len()
df.head()

Unnamed: 0,text,num_of_char,num_of_words
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,716,123
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,857,123
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,1980,339
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,814,113
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,1117,171


In [9]:
# Vocabulary size: number of distinct whitespace-separated tokens in each
# document, compared after lower-casing.
lowered_tokens = df['text'].str.lower().str.split()
df['num_vocab'] = lowered_tokens.apply(lambda tokens: len(set(tokens)))
df.head()

Unnamed: 0,text,num_of_char,num_of_words,num_vocab
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,716,123,93
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,857,123,99
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,1980,339,219
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,814,113,96
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,1117,171,139


### Estimating text complexity by counting sentences

In [10]:
from nltk.tokenize import sent_tokenize

# Short sample passage used to try out the sentence tokenizer.
text = """
The alarm rang at 7 in the morning as it usually did on Tuesdays. She rolled over, stretched her arm, and stumbled to the button till she finally managed to switch it off. Reluctantly, she got up and went for a shower. The water was cold as the day before the engineers did not manage to get the boiler working. Good thing it was still summer.
Upstairs, her cat waited eagerly for his morning snack. Miaow! He voiced with excitement as he saw her climb the stairs.
"""

# Split the passage into a list of sentence strings and display it.
sentences = sent_tokenize(text)
sentences

['\nThe alarm rang at 7 in the morning as it usually did on Tuesdays.',
 'She rolled over, stretched her arm, and stumbled to the button till she finally managed to switch it off.',
 'Reluctantly, she got up and went for a shower.',
 'The water was cold as the day before the engineers did not manage to get the boiler working.',
 'Good thing it was still summer.',
 'Upstairs, her cat waited eagerly for his morning snack.',
 'Miaow!',
 'He voiced with excitement as he saw her climb the stairs.']

In [11]:
# Number of sentences the tokenizer finds in the sample text.
sentence_list = sent_tokenize(text)
len(sentence_list)

8

In [12]:
# Reload the train subset of the 20 Newsgroups dataset into a fresh pandas
# DataFrame; this replaces the df built earlier, so its feature columns are gone.
data = fetch_20newsgroups(subset='train')
df = pd.DataFrame({'text': data.data})
df.head()

Unnamed: 0,text
0,From: lerxst@wam.umd.edu (where's my thing)\nS...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...


In [13]:
# Keep only the rows labelled 1 through 10 (.loc slices include both end
# labels, so this is 10 rows and row 0 is left out).
# The explicit .copy() makes df an independent frame; without it the column
# assignments in the following cells write into a slice of the full frame and
# pandas emits SettingWithCopyWarning.
df = df.loc[1:10].copy()

In [15]:
# Display the 10-row subset selected above.
df

Unnamed: 0,text
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...
5,From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)\...
6,From: bmdelane@quads.uchicago.edu (brian manni...
7,From: bgrubb@dante.nmsu.edu (GRUBB)\nSubject: ...
8,From: holmes7000@iscsvax.uni.edu\nSubject: WIn...
9,From: kerr@ux1.cso.uiuc.edu (Stan Kerr)\nSubje...
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstei...


In [16]:
# Remove the first part of each post, which contains information about the
# email sender, subject, and other details we are not interested in.
# Most of this information comes before the word "Lines" followed by ":",
# so split each string at "Lines:" and keep the part after it.
#
# n=1 splits at the first "Lines:" only, so a later occurrence inside the
# message body does not cut the text short. .str[-1] takes the last piece,
# which is the whole post when "Lines:" is absent; indexing with [1] would
# raise IndexError in that case, e.g. when this cell is run a second time.
# assign() builds a new frame instead of writing into the existing one.
df = df.assign(text=df['text'].str.split('Lines:', n=1).str[-1])
df

Unnamed: 0,text
1,11\nNNTP-Posting-Host: carson.u.washington.ed...
2,"36\n\nwell folks, my mac plus finally gave up..."
3,14\nDistribution: world\nNNTP-Posting-Host: a...
4,23\n\nFrom article <C5owCB.n3p@world.std.com>...
5,58\n\nIn article <1r1eu1$4t@transfer.stratus....
6,12\n\nThere were a few people who responded t...
7,44\nDistribution: world\nNNTP-Posting-Host: d...
8,10\n\nI have win 3.0 and downloaded several i...
9,29\n\njap10@po.CWRU.Edu (Joseph A. Pellettier...
10,13\n\nI have a line on a Ducati 900GTS 1978 m...


In [17]:
# Sentence count for each document: tokenize the text into sentences and
# store how many were found.
df['num_of_sent'] = df['text'].apply(lambda doc: len(sent_tokenize(doc)))
df

Unnamed: 0,text,num_of_sent
1,11\nNNTP-Posting-Host: carson.u.washington.ed...,6
2,"36\n\nwell folks, my mac plus finally gave up...",9
3,14\nDistribution: world\nNNTP-Posting-Host: a...,7
4,23\n\nFrom article <C5owCB.n3p@world.std.com>...,10
5,58\n\nIn article <1r1eu1$4t@transfer.stratus....,21
6,12\n\nThere were a few people who responded t...,8
7,44\nDistribution: world\nNNTP-Posting-Host: d...,15
8,10\n\nI have win 3.0 and downloaded several i...,3
9,29\n\njap10@po.CWRU.Edu (Joseph A. Pellettier...,12
10,13\n\nI have a line on a Ducati 900GTS 1978 m...,11


### Implementing term frequency-inverse document frequency