## Mining most correlated nutrients

# DEFINE

#### ---Define the problem ---

### The dataset is only to be used as a proving ground to show off what techniques one might use to clean, wrangle and visualize data. 

In [231]:
#import everything we might need
import pandas as pd
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

__author__ = "Skyler Bullard"
__email__ = "skylercarcom@gmail.com"

### Load the data

In [232]:
    """
    Inspiration drawn from the dataset found on Niharika Pandit's Kaggle page 
    under the "Nutritional Facts for most common foods" project.
    """

data = pd.read_csv( 'nutrients_csvfile.csv' )

### The letter 't' is found in a numerical feature.

In [233]:
data.describe()

Unnamed: 0,Food,Measure,Grams,Calories,Protein,Fat,Sat.Fat,Fiber,Carbs,Category
count,335,335,335,334,335,335,333,335,335,335
unique,329,61,103,152,40,45,38,57,80,16
top,Butter,1 cup,100,100,1,t,0,0,0,"Breads, cereals, fastfood,grains"
freq,3,120,38,14,53,115,174,116,45,45


### Some features include null objects.

In [234]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335 entries, 0 to 334
Data columns (total 10 columns):
Food        335 non-null object
Measure     335 non-null object
Grams       335 non-null object
Calories    334 non-null object
Protein     335 non-null object
Fat         335 non-null object
Sat.Fat     333 non-null object
Fiber       335 non-null object
Carbs       335 non-null object
Category    335 non-null object
dtypes: object(10)
memory usage: 26.2+ KB


### Search for missing data and locate each index

In [235]:
def missing_search( df , col ):
        
    count = 0
        
    index_dict = {}
        
    missing_series = df[ col ].isnull()
        
    for i in missing_series:
            
        if i == True:
                
            count += 1
                
            index_dict[ " Count " ] = count
                
            index_dict[ " Index " ] = missing_series[ missing_series == i ]
                
    return index_dict

In [236]:
def missing_rows( df ):
        
    headers = df.columns.tolist()
        
    for name in headers:
            
        print( missing_search( df , name ) )

In [237]:
missing_rows( data )

{}
{}
{}
{' Count ': 1, ' Index ': 134    True
Name: Calories, dtype: bool}
{}
{}
{' Count ': 2, ' Index ': 42     True
100    True
Name: Sat.Fat, dtype: bool}
{}
{}
{}


In [238]:
data_to_drop = data.iloc[ 134 ]
    
print( data_to_drop )

Food           Frozen peas
Measure              1 cup
Grams                  100
Calories               NaN
Protein                  5
Fat                      t
Sat.Fat                  0
Fiber                  1.8
Carbs                   12
Category    Vegetables R-Z
Name: 134, dtype: object


In [239]:
data_to_drop = data.iloc[ 100 ]
    
print( data_to_drop )

Food             Beetroots
Measure              1 cup
Grams                  165
Calories                 1
Protein                 12
Fat                      0
Sat.Fat                NaN
Fiber                    t
Carbs                 0.80
Category    Vegetables A-E
Name: 100, dtype: object


In [240]:
data_to_drop = data.iloc[ 42 ]
    
print( data_to_drop )

Food            Salt pork
Measure             2 oz.
Grams                  60
Calories              470
Protein                 3
Fat                    55
Sat.Fat               NaN
Fiber                   0
Carbs                   0
Category    Meat, Poultry
Name: 42, dtype: object


In [241]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335 entries, 0 to 334
Data columns (total 10 columns):
Food        335 non-null object
Measure     335 non-null object
Grams       335 non-null object
Calories    334 non-null object
Protein     335 non-null object
Fat         335 non-null object
Sat.Fat     333 non-null object
Fiber       335 non-null object
Carbs       335 non-null object
Category    335 non-null object
dtypes: object(10)
memory usage: 26.2+ KB


In [242]:
def drop_rows( df , index_array ):
    
    for i in index_array:
        
        df.drop( df.index[ i ] , axis = 0  , inplace = True )

In [243]:
ind_arr = [ 134 , 42 , 100 ]

drop_rows( data , ind_arr )

In [244]:
data_to_drop = data.iloc[ 134 ]
    
print( data_to_drop )

Food        Peppers canned
Measure              1 pod
Grams                   38
Calories                10
Protein                  t
Fat                      t
Sat.Fat                  0
Fiber                    t
Carbs                    2
Category    Vegetables R-Z
Name: 137, dtype: object


In [245]:
data_to_drop = data.iloc[ 100 ]
    
print( data_to_drop )

Food        Brussels sprouts
Measure                1 cup
Grams                    130
Calories                  60
Protein                    6
Fat                        t
Sat.Fat                    0
Fiber                    1.7
Carbs                     12
Category      Vegetables A-E
Name: 102, dtype: object


In [246]:
data_to_drop = data.iloc[ 42 ]
    
print( data_to_drop )

Food                Bacon
Measure          2 slices
Grams                  16
Calories               95
Protein                 4
Fat                     8
Sat.Fat                 7
Fiber                   0
Carbs                   1
Category    Meat, Poultry
Name: 43, dtype: object


In [247]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 332 entries, 0 to 334
Data columns (total 10 columns):
Food        332 non-null object
Measure     332 non-null object
Grams       332 non-null object
Calories    332 non-null object
Protein     332 non-null object
Fat         332 non-null object
Sat.Fat     331 non-null object
Fiber       332 non-null object
Carbs       332 non-null object
Category    332 non-null object
dtypes: object(10)
memory usage: 28.5+ KB
