### Exercise 2.12: Extracting General Features from Text
In this exercise, we will extract various general features from documents. The dataset that we will be using here consists of random statements. Our objective is to find the frequency of various general features such as punctuation, uppercase and lowercase words, letters, digits, words, and whitespaces.

In [1]:
import pandas as pd
from string import punctuation
import nltk
nltk.download('tagsets')
from nltk.data import load
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
from nltk import word_tokenize
from collections import Counter

[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/LNonyane/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/LNonyane/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Kinds of speech provided by nltk
def get_tagsets():
    tagdict = load('help/tagsets/upenn_tagset.pickle')
    return list(tagdict.keys())
tag_list = get_tagsets()
print(tag_list)

['LS', 'TO', 'VBN', "''", 'WP', 'UH', 'VBG', 'JJ', 'VBZ', '--', 'VBP', 'NN', 'DT', 'PRP', ':', 'WP$', 'NNPS', 'PRP$', 'WDT', '(', ')', '.', ',', '``', '$', 'RB', 'RBR', 'RBS', 'VBD', 'IN', 'FW', 'RP', 'JJR', 'JJS', 'PDT', 'MD', 'VB', 'WRB', 'NNP', 'EX', 'NNS', 'SYM', 'CC', 'CD', 'POS']


In [8]:
# Count occurrence of pos tags in each sentence.
def get_pos_occurrence_freq(data, tag_list):
    # Get list of sentences in text_list
    text_list = data.text
    
    # Create empty dataframe
    feature_df = pd.DataFrame(columns=tag_list)
    for text_line in text_list:
        
        # Get pos tags of each word.
        pos_tags = [j for i, j in pos_tag(word_tokenize(text_line))]
        
        # Dict of pos tags and their frequency in given sentence.
        row = dict(Counter(pos_tags))
        feature_df = feature_df.append(row, ignore_index=True)
    feature_df.fillna(0, inplace=True)
    return feature_df

tag_list = get_tagsets()
data = pd.read_csv('data.csv', header=0)
feature_df = get_pos_occurrence_freq(data, tag_list)
feature_df.head()

  feature_df = feature_df.append(row, ignore_index=True)
  feature_df = feature_df.append(row, ignore_index=True)
  feature_df = feature_df.append(row, ignore_index=True)
  feature_df = feature_df.append(row, ignore_index=True)
  feature_df = feature_df.append(row, ignore_index=True)
  feature_df = feature_df.append(row, ignore_index=True)
  feature_df = feature_df.append(row, ignore_index=True)
  feature_df = feature_df.append(row, ignore_index=True)
  feature_df = feature_df.append(row, ignore_index=True)
  feature_df = feature_df.append(row, ignore_index=True)
  feature_df = feature_df.append(row, ignore_index=True)
  feature_df = feature_df.append(row, ignore_index=True)
  feature_df = feature_df.append(row, ignore_index=True)
  feature_df = feature_df.append(row, ignore_index=True)
  feature_df = feature_df.append(row, ignore_index=True)


Unnamed: 0,LS,TO,VBN,'',WP,UH,VBG,JJ,VBZ,--,...,MD,VB,WRB,NNP,EX,NNS,SYM,CC,CD,POS
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Calculate the number of punctuation marks
def add_punctuation_count(feature_df, data):
    feature_df['num_of_unique_punctuations'] = data['text'].apply(lambda x: len(set(x).intersection(set(punctuation))))
    return feature_df
feature_df = add_punctuation_count(feature_df, data)
feature_df['num_of_unique_punctuations'].head()

0    0
1    0
2    1
3    1
4    0
Name: num_of_unique_punctuations, dtype: int64

In [10]:
# Calculate number of capitalized words
def get_capitalized_word_count(feature_df, data):
    # Tokenize text in every row; create set of only capital words; find length of set and add to column 'number_of_capital_words'
    feature_df['number_of_capital_words'] = data['text'].apply(lambda x: len([word for word in word_tokenize(str(x)) if word[0].isupper()]))
    return feature_df
feature_df = get_capitalized_word_count(feature_df, data)
feature_df['number_of_capital_words'].head()

0    1
1    1
2    1
3    1
4    1
Name: number_of_capital_words, dtype: int64

In [11]:
# Calculate number of letters in Dataframe
def get_number_of_alphabets(feature_df, data):
    feature_df['number_of_alphabets'] = data['text'].apply(lambda x: len([ch for ch in str(x) if ch.isalpha()]))
    return feature_df
feature_df = get_number_of_alphabets(feature_df, data)
feature_df['number_of_alphabets'].head()

0    19
1    18
2    28
3    14
4    13
Name: number_of_alphabets, dtype: int64

In [12]:
feature_df.head()

Unnamed: 0,LS,TO,VBN,'',WP,UH,VBG,JJ,VBZ,--,...,NNP,EX,NNS,SYM,CC,CD,POS,num_of_unique_punctuations,number_of_capital_words,number_of_alphabets
0,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,19
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,18
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,28
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,1,14
4,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,1,13
