# Tutorial

## Detect implicit features from a data set.

### Import modules

In [6]:
"""
Import everything used by this tutorial.

From the feature_mining package: ParseAndModel, EmVectorByFeature and GFLM.
Third-party / standard library: pandas, en_core_web_sm and pprint.
"""
import feature_mining
from feature_mining import ParseAndModel
from feature_mining import EmVectorByFeature
from feature_mining import GFLM
import pandas as pd
import en_core_web_sm
from pprint import pprint

## Load the demo files

In [7]:
# Predefined features to model. The nested list presumably groups several
# words under one feature ("screen"/"display") -- TODO confirm against ParseAndModel.
feature_list = [
    "sound",
    "battery",
    ["screen", "display"],
]

# Input data file, given relative to the current working directory.
filename = './iPod.final'

# Build the model from the feature list and the input file, reading 100 lines.
pm = ParseAndModel(
    feature_list=feature_list,
    filename=filename,
    nlines=100,
)

# Show the names of the entries available in the model results.
print(pm.model_results.keys())

dict_keys(['model_background', 'model_feature', 'section_word_counts_matrix', 'model_background_matrix', 'model_feature_matrix', 'vocabulary_lookup'])


The cell above builds the model from the demo file `iPod.final`, which is expected in the notebook's working directory.
For a real-life dataset, download this file: https://raw.githubusercontent.com/nfreundlich/CS410_CourseProject/dev/tests/data/parse_and_model/iPod.final

## Inspect the model

In [8]:
# Bind the results dictionary once so each lookup below stays short.
model_results = pm.model_results

# Names of all entries in the model dictionary
print(model_results.keys())

# Language background model: first seven values
print("Model background")
pprint(model_results['model_background'][:7])

# Feature model: first two values of the first entry
print("Feature model")
pprint(model_results['model_feature'][0][:2])

# Per-section (sentence/line) word counts, stored as a sparse matrix
print("Section word counts matrix (sentence/line) - sparse")
pprint(model_results['section_word_counts_matrix'][0][:2])

# Background model in sparse matrix form
print("Background model matrix - sparse")
pprint(model_results['model_background_matrix'][0][:2])

# Feature model matrix: first two rows (the trailing full slice keeps all of them)
print("Feature model matrix")
pprint(model_results['model_feature_matrix'][:2][:])

# First word of the vocabulary lookup
print("Vocabulary words")
pprint(model_results['vocabulary_lookup'][0])

dict_keys(['model_background', 'model_feature', 'section_word_counts_matrix', 'model_background_matrix', 'model_feature_matrix', 'vocabulary_lookup'])
Model background
[0.004310344827586207,
 0.004310344827586207,
 0.01293103448275862,
 0.05603448275862069,
 0.023706896551724137,
 0.0021551724137931034,
 0.017241379310344827]
Feature model
[0.0035684588810039313, 0.0035684588810039313]
Section word counts matrix (sentence/line) - sparse
<1x258 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>
Background model matrix - sparse
<1x258 sparse matrix of type '<class 'numpy.float64'>'
	with 258 stored elements in Compressed Sparse Row format>
Feature model matrix
array([[0.00356846, 0.00336429, 0.00363519],
       [0.00356846, 0.00336429, 0.00363519]])
Vocabulary words
'pleased'


### Launch Expectation Maximization on the features

In [9]:
# Announce the step before starting the run.
print("Calling EMVectorByFeature")

# Configure the run on top of the parsed model from the earlier cell.
# max_iter presumably caps the number of iterations -- TODO confirm.
em = EmVectorByFeature(explicit_model=pm, max_iter=30)

# Launch expectation maximization.
em.em()

Calling EMVectorByFeature
EmVectorByFeature - base init...
EmVectorByFeature - base loop...
------------------------------------------------------------
-------------------- Running Iteration: 0 --------------------
------------------------------------------------------------
Elapsed on iteration: 0.008 seconds
------------------------------------------------------------
-------------------- Running Iteration: 1 --------------------
------------------------------------------------------------
Elapsed on iteration: 0.0067 seconds
------------------------------------------------------------
-------------------- Running Iteration: 2 --------------------
------------------------------------------------------------
Elapsed on iteration: 0.0059 seconds
------------------------------------------------------------
-------------------- Running Iteration: 3 --------------------
------------------------------------------------------------
Elapsed on iteration: 0.0096 seconds
---------------------

### Compute GFLM

In [14]:
# Build GFLM from the EM results. Both thresholds use the same value (0.35);
# presumably scores below the threshold are dropped -- TODO confirm.
gflm = GFLM(
    em_results=em,
    section_threshold=0.35,
    word_threshold=0.35,
)

# Compute the section-level results first, then the word-level ones.
gflm.calc_gflm_section()
gflm.calc_gflm_word()

# Preview the first 20 rows of each result table.
print(gflm.gflm_word.head(20))
print(gflm.gflm_section.head(20))

    gflm_word  section_id  implicit_feature_id
0    0.380884           2                    0
1    0.748721          12                    0
2    0.653669          22                    0
3    0.569295          26                    0
4    0.358128          57                    0
5    0.359517          65                    0
6    0.406727          83                    0
7    0.357672          90                    0
8    0.353637          94                    0
9    0.354839          99                    0
10   0.553098          11                    1
11   0.737475          13                    1
12   0.737475          14                    1
13   0.428765          18                    1
14   0.434852          19                    1
15   0.400439          37                    1
16   0.400454          43                    1
17   0.554452          44                    1
18   0.393325          47                    1
19   0.479416          50                    1
    gflm_sect

## TODO: Display tagged text

## TODO: Call only the wrapper function