In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import data.get_data as data
import utils.text_manipulation as txtm

def sort_vocab(vocab_matrix):
    tuples = zip(vocab_matrix.col, vocab_matrix.data)
    tup_sorted = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
    return tup_sorted


def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """return n-gram counts in asc order of counts"""

    # use only topn items from vector
    sorted_items = sorted_items[-topn:]

    results = []

    for idx, count in sorted_items:
        # get the ngram name
        n_gram = feature_names[idx]
        results.append((n_gram, count))

    return results


def preprocess_stpw_clean_lem(df_train, df_test):

    df_train = txtm.preprocess_dataframe(df_train)
    df_test = txtm.preprocess_dataframe(df_test)

    return df_train, df_test

def ng_CountVectorizer(X_train, ngram_range=(1, 3)):
    """
    create CountVectorizer to validate in df the use of it.
    Parameters
    ----------
    X_train
    ngram_range

    Returns
    -------

    """
    # print('ngram_range=', ngram_range)
    count_vectorizer = CountVectorizer(stop_words=None, ngram_range=ngram_range)
    bag_of_words = count_vectorizer.fit_transform(X_train)

    # returns the Bag-of-Words Model as a pandas DataFrame
    feature_names = count_vectorizer.get_feature_names()
    df_bag_of_words = pd.DataFrame(bag_of_words.toarray(), columns=feature_names)

    return count_vectorizer, df_bag_of_words, bag_of_words


def ng_TfidfVectorizer(X_train, ngram_range):
    # create vectorizer out of words of questions
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(X_train)

    # Show the Model as a pandas DataFrame
    feature_names = tfidf_vectorizer.get_feature_names()
    df_tfidf_vectorizer = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

    # print(type(tfidf_matrix))
    # print(tfidf_matrix.shape)
    return df_tfidf_vectorizer, tfidf_matrix


def get_category_texts(df, category, df_col):
    """
    abstract category text request to test with different texts
    Parameters
    ----------
    df: data frame
    category: category to be returned
    df_col: df column text to process

    Returns
    df df col to process
    -------

    """
    return df[df.category == category][df_col]






[nltk_data] Downloading package wordnet to /home/an/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/an/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
def get_category_ngrams(x_train, ngram_range=(1, 1), topn=10):
    count_vectorizer, df_bag_of_words, bag_of_words = ng_CountVectorizer(x_train, ngram_range)

    # Get full vocab names
    vocabulary = count_vectorizer.vocabulary_

    max_words = dict((word, df_bag_of_words[word].sum()) for word in vocabulary)
    smax_words = sorted(max_words, key=max_words.get, reverse=True)[:topn]

    #print('words:', smax_words)
    #print('words: wordcount ', end=' ')
    for word in smax_words:
        print(word, ':', max_words[word], end=', ')
    return smax_words, max_words

In [15]:
smax_words, max_words = get_category_ngrams(df_train['question'], ngram_range=(1, 1), topn=10)

the : 3775, what : 3377, is : 1681, of : 1548, in : 1207, how : 789, was : 653, to : 615, who : 614, are : 453, 

In [16]:
max_words

{'how': 789,
 'did': 388,
 'serfdom': 1,
 'develop': 2,
 'in': 1207,
 'and': 433,
 'then': 11,
 'leave': 6,
 'russia': 5,
 'what': 3377,
 'films': 6,
 'featured': 8,
 'the': 3775,
 'character': 33,
 'popeye': 4,
 'doyle': 1,
 'can': 191,
 'find': 86,
 'list': 10,
 'of': 1548,
 'celebrities': 3,
 'real': 15,
 'names': 26,
 'fowl': 1,
 'grabs': 1,
 'spotlight': 1,
 'after': 32,
 'chinese': 10,
 'year': 87,
 'monkey': 3,
 'is': 1681,
 'full': 10,
 'form': 17,
 'com': 12,
 'contemptible': 3,
 'scoundrel': 3,
 'stole': 5,
 'cork': 4,
 'from': 161,
 'my': 40,
 'lunch': 3,
 'team': 34,
 'baseball': 33,
 'st': 14,
 'louis': 5,
 'browns': 2,
 'become': 27,
 'oldest': 12,
 'profession': 5,
 'are': 453,
 'liver': 2,
 'enzymes': 1,
 'name': 331,
 'scar': 1,
 'faced': 3,
 'bounty': 4,
 'hunter': 4,
 'old': 39,
 'west': 20,
 'when': 190,
 'was': 653,
 'ozzy': 1,
 'osbourne': 1,
 'born': 31,
 'why': 105,
 'do': 336,
 'heavier': 2,
 'objects': 3,
 'travel': 12,
 'downhill': 1,
 'faster': 1,
 'who': 61

In [36]:
type(max_words)

dict

In [38]:
values = list(max_words.values())

keys = list(max_words.keys())

#{v: values[i] for i, v in enumerate(smax_words)}

list = []
    
for word in smax_words:
    if word in keys:
        list.append(values[])
    



['how',
 'did',
 'serfdom',
 'develop',
 'in',
 'and',
 'then',
 'leave',
 'russia',
 'what',
 'films',
 'featured',
 'the',
 'character',
 'popeye',
 'doyle',
 'can',
 'find',
 'list',
 'of',
 'celebrities',
 'real',
 'names',
 'fowl',
 'grabs',
 'spotlight',
 'after',
 'chinese',
 'year',
 'monkey',
 'is',
 'full',
 'form',
 'com',
 'contemptible',
 'scoundrel',
 'stole',
 'cork',
 'from',
 'my',
 'lunch',
 'team',
 'baseball',
 'st',
 'louis',
 'browns',
 'become',
 'oldest',
 'profession',
 'are',
 'liver',
 'enzymes',
 'name',
 'scar',
 'faced',
 'bounty',
 'hunter',
 'old',
 'west',
 'when',
 'was',
 'ozzy',
 'osbourne',
 'born',
 'why',
 'do',
 'heavier',
 'objects',
 'travel',
 'downhill',
 'faster',
 'who',
 'pride',
 'yankees',
 'killed',
 'gandhi',
 'considered',
 'costliest',
 'disaster',
 'insurance',
 'industry',
 'has',
 'ever',
 'sprawling',
 'state',
 'boasts',
 'most',
 'airports',
 'only',
 'repealed',
 'amendment',
 'to',
 'constitution',
 'deal',
 'with',
 'many',


In [31]:
def main():
    """
    Testing some function in file
    """
    # loading data and working with pd df
    df_train = data.get_train_data()
    df_test = data.get_test_data()

    # using cleaned df with augmented questions

    df_train, df_test = preprocess_stpw_clean_lem(df_train, df_test)

    #X_train = df_train['question'] # all full text

    categories = df_train.category.unique()
    print(categories)

    #################
    # default setting for ngram viz
    #################

    #df_col = 'question' # main without changes
    # df_col = 'text_stopwords'  # main without stopwords
    df_col = 'text_clean'  # main without stopwords and regex cleaned lowercase
    # df_col = 'text_lemma'  # main without stopwords, regex cleaned lowercase, lemma

    ngram_range = (1, 2)
    topn = 10
    
    ngram_ranges = [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)]
   
    #df_max_words = pd.DataFrame(columns=ngram_ranges)
    
    for ngram_range in ngram_ranges:
        #print()
        #print('ngram_range:', ngram_range)
        #smax_words = get_category_ngrams(df_train['question'], ngram_range=ngram_range, topn=topn)
        
                
        for category in categories:
            #print()
            #print('Category:', category, end=' ')
            x_train = get_category_texts(df_train, category, df_col)
            smax_words = get_category_ngrams(x_train, ngram_range, topn)
           # df_max_words[ngram_range] = smax_words
            
    return df_max_words       

if __name__ == '__main__':
    main()


['DESC' 'ENTY' 'ABBR' 'HUM' 'NUM' 'LOC']
what : 749, how : 278, why : 104, mean : 62, origin : 54, get : 37, name : 32, difference : 32, word : 30, find : 27, 

  return array(a, dtype, copy=False, order=order)


what : 1112, name : 119, fear : 66, first : 50, kind : 43, which : 43, called : 41, used : 38, world : 37, film : 36, what : 81, stand : 41, abbreviation : 16, mean : 9, national : 5, bureau : 5, investigation : 5, acronym : 4, used : 3, cnn : 3, who : 559, what : 535, name : 144, the : 86, first : 82, president : 65, which : 46, company : 44, wrote : 38, world : 37, how : 479, many : 323, what : 245, when : 124, year : 68, much : 57, long : 56, people : 41, first : 35, take : 31, what : 524, where : 255, country : 123, city : 102, state : 65, world : 62, find : 52, largest : 46, name : 32, river : 26, what : 749, how : 278, why : 104, mean : 62, origin : 54, what origin : 53, get : 37, name : 32, difference : 32, what difference : 32, what : 1112, name : 119, fear : 66, what fear : 62, what name : 60, first : 50, kind : 43, which : 43, called : 41, used : 38, what : 81, stand : 41, abbreviation : 16, what abbreviation : 13, mean : 9, what stand : 8, national : 5, bureau : 5, investiga

In [4]:
# data
path_class_def  = 'https://cogcomp.seas.upenn.edu/Data/QA/QC/definition.html'
path_train_data = 'https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label'
path_test_data  = 'https://cogcomp.seas.upenn.edu/Data/QA/QC/TREC_10.label'

In [5]:
# from folder data
# import data.get_data.process_question(row)
def process_question(row):
   '''join row to text''' 
   text =  " ".join(row.split(" ")[1:])
   return text

In [6]:
#load
train_df = pd.read_table(path_train_data, encoding = "ISO-8859-1", header=None)
train_df.columns = ["raw"]
train_df['category'] = train_df.apply (lambda row: row["raw"].split(":")[0], axis=1)
train_df['subcategory'] = train_df.apply (lambda row: row["raw"].split(" ")[0].split(":")[1], axis=1)
train_df['question'] = train_df.apply (lambda row: process_question(row["raw"]), axis=1)

train_df.head(2)

df_train = train_df

In [7]:
ng_countvec = ng_CountVectorizer(df_train['question'], ngram_range=(1, 3))

(CountVectorizer(ngram_range=(1, 3)),
       000  000 check  000 for  000 for an  000 in  000 in the  000 miles  \
 0       0          0        0           0       0           0          0   
 1       0          0        0           0       0           0          0   
 2       0          0        0           0       0           0          0   
 3       0          0        0           0       0           0          0   
 4       0          0        0           0       0           0          0   
 ...   ...        ...      ...         ...     ...         ...        ...   
 5447    0          0        0           0       0           0          0   
 5448    0          0        0           0       0           0          0   
 5449    0          0        0           0       0           0          0   
 5450    0          0        0           0       0           0          0   
 5451    0          0        0           0       0           0          0   
 
       000 patents  000 patents to  

In [8]:
df_train.head()

Unnamed: 0,raw,category,subcategory,question
0,DESC:manner How did serfdom develop in and the...,DESC,manner,How did serfdom develop in and then leave Russ...
1,ENTY:cremat What films featured the character ...,ENTY,cremat,What films featured the character Popeye Doyle ?
2,DESC:manner How can I find a list of celebriti...,DESC,manner,How can I find a list of celebrities ' real na...
3,ENTY:animal What fowl grabs the spotlight afte...,ENTY,animal,What fowl grabs the spotlight after the Chines...
4,ABBR:exp What is the full form of .com ?,ABBR,exp,What is the full form of .com ?


In [9]:
get_category_ngrams(df_train['question'], ngram_range=(1, 2), topn=10)

words: ['the', 'what', 'is', 'of', 'in', 'what is', 'how', 'is the', 'was', 'to']
words: wordcount  the : 3775, what : 3377, is : 1681, of : 1548, in : 1207, what is : 971, how : 789, is the : 763, was : 653, to : 615, 

['the', 'what', 'is', 'of', 'in', 'what is', 'how', 'is the', 'was', 'to']

In [10]:
smax_words = get_category_ngrams(df_train['question'], ngram_range=(1, 2), topn=10)

words: ['the', 'what', 'is', 'of', 'in', 'what is', 'how', 'is the', 'was', 'to']
words: wordcount  the : 3775, what : 3377, is : 1681, of : 1548, in : 1207, what is : 971, how : 789, is the : 763, was : 653, to : 615, 

In [11]:
smax_words

['the', 'what', 'is', 'of', 'in', 'what is', 'how', 'is the', 'was', 'to']

In [12]:
df0['(1,2)'] = smax_words
df0

NameError: name 'df0' is not defined

In [None]:
df_max_words = main()