## Imports

In [1]:
# Note: `os` is part of the Python standard library, so no pip install is required.
from os import listdir
import string
from pickle import dump,load

## Loading the Data

In [2]:
class LoadData:
    """Load story files from a directory and split each into text and highlights.

    Each file is expected to contain the article body followed by zero or more
    summary sentences, every one introduced by the ``@highlight`` marker.

    Parameters
    ----------
    directory : str
        Path of the folder that holds one story per file.
    """

    # Marker that precedes every highlight (summary sentence) in a story file.
    HIGHLIGHT_MARKER = '@highlight'

    def __init__(self, directory):
        self.directory= directory

    @staticmethod
    def _load_doc(filename):
        """Return the full UTF-8 decoded contents of ``filename``."""
        # The context manager guarantees the handle is closed even if read() fails.
        with open(filename, encoding='utf-8') as file:
            return file.read()

    @classmethod
    def _split_story(cls, doc):
        """Split a document into ``(story, highlights)`` at the first marker.

        If the marker does not occur, the whole document is the story and the
        list of highlights is empty.
        """
        marker = cls.HIGHLIGHT_MARKER
        index = doc.find(marker)
        if index < 0:
            # str.find returns -1 when the marker is missing; slicing with -1
            # would chop the last character off the story and report it as a
            # highlight, so handle this case explicitly.
            return doc, []

        story = doc[:index]
        # Strip first, then filter, so whitespace-only fragments are dropped too.
        stripped = (h.strip() for h in doc[index:].split(marker))
        highlights = [h for h in stripped if h]
        return story, highlights

    def load_stories(self, limit=1000):
        """Load the stories and return them as a list of dictionaries.

        Parameters
        ----------
        limit : int or None, default 1000
            Maximum number of directory entries to read; ``None`` reads all.

        Returns
        -------
        list of dict
            One ``{'story': str, 'highlights': list of str}`` entry per file.

        Notes
        -----
        ``os.listdir`` returns entries in arbitrary order, so which files fall
        inside ``limit`` is not guaranteed to be the same on every machine.
        """
        all_stories = []
        for name in listdir(self.directory)[:limit]:
            filename = self.directory + '/' + name
            story, highlights = self._split_story(self._load_doc(filename))
            all_stories.append({'story': story, 'highlights': highlights})

        return all_stories

In [3]:
# NOTE(review): absolute, machine-specific path — point this at the local copy of the CNN stories folder.
DIR_PATH= "/home/nikhil/Downloads/cnn/stories"
# Read the story files found in DIR_PATH (load_stories reads at most the first 1000 listed files).
obj= LoadData(DIR_PATH)
stories= obj.load_stories()

In [4]:
len(stories)

1000

In [5]:
print(stories[10]['highlights'])
print()
print(stories[10]['story'])

['Shirley Sotloff pleads directly to the leader of ISIS', '"Please release my child," she says', 'Steven Sotloff disappeared while reporting in Syria last year']

A mother's plea to the terrorists holding her son hostage: No individual should be punished for events he cannot control.

The mother is Shirley Sotloff, and she speaks directly to ISIS leader  Abu Bakr al-Baghdadi in a video broadcast Wednesday on Al Arabiya Network.

Her son, freelance journalist Steven Sotloff, appeared last week in an ISIS video showing the decapitation of American journalist James Foley.

The militant in the video warns that Steven Sotloff's fate depends on what President Barack Obama does next in Iraq.

A day after the video was posted, Obama vowed that the United States would be "relentless" in striking back against ISIS.

"Steven is a journalist who traveled to the Middle East to cover the suffering of Muslims at the hands of tyrants. Steven is a loyal and generous son, brother and grandson," Shirley 

In [6]:
stories[:2]

[{'highlights': ['168 police officers and 67 firefighters are laid off in Camden, New Jersey',
   "City's mayor said she couldn't get $8 million in budget concessions to save jobs",
   'The mayor had been asking police and firefighters for concessions',
   'They were asked to pay more for health care and accept salary freezes or reductions'],
  'story': '(CNN) -- The mayor of crime-ridden Camden, New Jersey, has announced layoffs of nearly half of the city\'s police force and close to a third of its fire department.\n\nOne hundred sixty-eight police officers and 67 firefighters were laid off Tuesday, as officials struggle to close a $26.5 million budget gap through a series of belt-tightening measures, Mayor Dana Redd told reporters. The layoffs take effect immediately.\n\nRedd said she was unable to secure the $8 million in budget concessions that she says she needed to save the jobs of up to 100 police officers and many of the city\'s firefighters.\n\nThe mayor -- who said she will c

## Data Cleaning

In [8]:
class Clean_data:
    """Normalise raw story or highlight lines into lowercase alphabetic text."""

    def __init__(self):
        """Nothing to configure; the cleaner keeps no state."""

    def clean_lines(self, lines):
        """Clean every line and return the non-empty results.

        For each line: drop everything up to and including the first ``(CNN)``
        tag, split on whitespace, lowercase each token, delete ASCII
        punctuation, and keep only tokens that are purely alphabetic. Lines
        that end up empty are omitted from the returned list.
        """
        source_tag = '(CNN)'
        strip_punctuation = str.maketrans('', '', string.punctuation)

        result = []
        for raw in lines:
            position = raw.find(source_tag)
            body = raw if position < 0 else raw[position + len(source_tag):]

            tokens = (token.lower().translate(strip_punctuation) for token in body.split())
            sentence = ' '.join(token for token in tokens if token.isalpha())

            if sentence:
                result.append(sentence)
        return result

In [12]:
# Clean the first 100 loaded stories: the story body is cleaned line by line,
# the highlights are already a list of sentences.
obj1 = Clean_data()
cleaned_stories = []
for example in stories[:100]:
    story_lines = example['story'].split('\n')
    cleaned_stories.append({
        'story': obj1.clean_lines(story_lines),
        'highlights': obj1.clean_lines(example['highlights']),
    })

In [13]:
cleaned_stories[60]

{'highlights': ['stepmother claims girls father dismembered body',
  'a judge releases search warrants from detectives probing zahra bakers death',
  'zahras remains were found november a month after she was reported missing'],
 'story': ['the stepmother of zahra baker told police the girl was killed two weeks before she was reported missing according to search warrants released tuesday',
  'stepmother elisa baker also told police in hickory north carolina that the disabled girls body was disposed of the next day september in various locations according to the documents',
  'she told police on november that the girls father adam baker dismembered the girl and the couple disposed of the remains',
  'while elisa baker has been charged with obstruction of justice for writing a fake ransom note and leaving it at the familys hickory home no one has been charged directly in the girls death elisa baker also is accused of writing worthless checks',
  'police have said she had been cooperating 

In [14]:
# Persist the cleaned sample; the context manager closes (and flushes) the
# file handle, which the bare open() call never did.
with open('/home/nikhil/Downloads/cnn/processed_sample_data/cnn_dataset.pkl', 'wb') as pickle_file:
    dump(cleaned_stories, pickle_file)

In [15]:
# Reload the pickled sample. NOTE: pickle.load can execute arbitrary code, so
# only load files that were written by this notebook or another trusted source.
with open('/home/nikhil/Downloads/cnn/processed_sample_data/cnn_dataset.pkl', 'rb') as pickle_file:
    cleaned_stories = load(pickle_file)
print('Loaded Stories %d' % len(cleaned_stories))

Loaded Stories 100


---

## Amazon Fine Food Reviews Dataset

## Imports

In [38]:
import pandas as pd
import numpy as np
import re

In [39]:
AMAZON_DATA_PATH= '/home/nikhil/Downloads/amazon-fine-food-reviews/Reviews.csv'

In [45]:
class Load_amazon_data:
    """Load the reviews CSV and expose its last two columns for inspection.

    Parameters
    ----------
    dir_path : str
        Path of the CSV file to read. Despite the name, this is passed
        directly to ``pd.read_csv``, so it must be a file path.
    seed : int, default 0
        Seed applied to NumPy's global random state on construction; the
        sampling in ``analyze_data`` draws from that global state.
    """

    def __init__(self, dir_path, seed= 0):
        self.dir_path= dir_path
        np.random.seed(seed)

    def load(self):
        """
        Reads the CSV file at ``self.dir_path`` and returns it as a DataFrame
        """
        return pd.read_csv(self.dir_path)

    def drop(self):
        """
        Drops incomplete rows and unnecessary columns

        Rows with a missing value in any column are removed, then only the
        last two columns are kept and the index is renumbered from 0.
        Note that the CSV is re-read on every call.
        """
        data = self.load().dropna()
        data = data.iloc[:, -2:]
        return data.reset_index(drop= True)

    def analyze_data(self):
        """
        Prints five randomly chosen sample data points from the cleaned data

        Positions are drawn from [10, 100) when the data has at least 100
        rows. For smaller data the range is clamped to the rows that exist,
        so the lookup can no longer raise IndexError.
        """
        data= self.drop()

        n_rows = len(data)
        if n_rows == 0:
            print("No data points to display.")
            return

        # Clamp the sampling window to the available rows.
        upper = min(100, n_rows)
        lower = 10 if upper > 10 else 0

        for sr_no, i in enumerate(np.random.randint(lower, upper, size= 5)):
            print("_________________________")
            print("Data Point {0}".format(sr_no + 1))
            print("Summary:")
            print(data['Summary'].iloc[i])
            print("Full Text:")
            print(data['Text'].iloc[i])

In [46]:
obj= Load_amazon_data(AMAZON_DATA_PATH, seed= 1)

## Load the Data

In [47]:
data= obj.load()
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## Dropping Unnecessary columns

In [48]:
data= obj.drop()
data.head()

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


## Analyze the data

In [49]:
obj.analyze_data()

_________________________
Data Point 1
Summary:
Mushy
Full Text:
The flavors are good.  However, I do not see any differce between this and Oaker Oats brand - they are both mushy.
_________________________
Data Point 2
Summary:
Delicious product!
Full Text:
I can remember buying this candy as a kid and the quality hasn't dropped in all these years. Still a superb product you won't be disappointed with.
_________________________
Data Point 3
Summary:
Forget Molecular Gastronomy - this stuff rockes a coffee creamer!
Full Text:
I know the product title says Molecular Gastronomy, but don't let that scare you off.  I have been looking for this for a while now, not for food science, but for something more down to earth.  I use it to make my own coffee creamer.<br /><br />I have to have my coffee blonde and sweet - but the flavored creamers are full of the bad kinds of fat, and honestly, I hate to use manufactured "food" items.  I really don't think they are good for the body.  On the other h

In [50]:
# Lookup table mapping lowercase English contractions to their expanded forms.
# Data_cleaning.clean_text lowercases the text and replaces every
# whitespace-delimited token that exactly matches one of these keys.
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [61]:
class Data_cleaning:
    """Text normaliser for review summaries and review bodies.

    Parameters
    ----------
    contraction_map : dict or None, default None
        Mapping from lowercase contraction to its expansion. When omitted,
        the notebook-level ``contractions`` table is used.

    Attributes
    ----------
    clean_summaries, clean_texts : list of str
        Results of the most recent ``clean`` call.
    """

    def __init__(self, contraction_map=None):
        self.clean_summaries= []
        self.clean_texts= []
        # Only touch the global table when the caller did not supply a mapping.
        self.contraction_map = contractions if contraction_map is None else contraction_map

    def clean_text(self, text, remove_stopwords = False):
        """
        Applies a series of cleaning operations to one string

        Steps: lowercase, expand contractions, remove URLs, remove leftover
        HTML fragments (``<a href``, ``&amp;``, ``<br>`` variants), replace
        punctuation and apostrophes with spaces, and optionally drop English
        stop words (requires NLTK and its ``stopwords`` corpus).

        Returns the cleaned string; runs of spaces are not collapsed.
        """
        text = text.lower()

        # Expand contractions token by token; this also collapses all
        # whitespace (including newlines) to single spaces.
        text = " ".join(self.contraction_map.get(word, word) for word in text.split())

        # Remove only the URL itself. Because the text is a single line at
        # this point, a greedy `.*` here would delete everything after it.
        text = re.sub(r'https?://\S+', '', text)
        text = re.sub(r'<a href', ' ', text)
        text = re.sub(r'&amp;', '', text)
        # Strip line-break tags before the punctuation pass rewrites the '/'.
        text = re.sub(r'<br\s*/?\s*>', ' ', text)
        text = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'\'', ' ', text)

        # Optionally, remove stop words
        if remove_stopwords:
            # Imported lazily: NLTK is only needed for this optional step.
            from nltk.corpus import stopwords
            stops = set(stopwords.words("english"))
            text = " ".join(w for w in text.split() if w not in stops)

        return text

    def clean(self, data):
        """
        Applies the clean_text() to the entire dataset

        ``data`` must expose ``Summary`` and ``Text`` attributes that iterate
        over strings. Results replace (rather than extend) any results
        stored by an earlier call, so re-running is idempotent.

        Returns the tuple ``(clean_summaries, clean_texts)``.
        """
        self.clean_summaries = [self.clean_text(summary) for summary in data.Summary]
        print("Summaries are complete.")

        self.clean_texts = [self.clean_text(text) for text in data.Text]
        print("Texts are complete.")

        return self.clean_summaries, self.clean_texts

In [62]:
# import nltk
# nltk.download('stopwords')

clean_obj= Data_cleaning()
clean_summaries, clean_texts= clean_obj.clean(data)

Summaries are complete.
Texts are complete.


In [63]:
# Re-seed so the five sampled positions are repeatable for this cell.
np.random.seed(1)

sample_positions = np.random.randint(10, 100, size= 5)
for sr_no, i in enumerate(sample_positions, start=1):
    print("_________________________")
    print("Data Point #{0}".format(sr_no))
    print("Summary:")
    print(clean_summaries[i])
    print("Full Text:")
    print(clean_texts[i])

_________________________
Data Point #1
Summary:
mushy
Full Text:
the flavors are good  however  i do not see any differce between this and oaker oats brand   they are both mushy 
_________________________
Data Point #2
Summary:
delicious product 
Full Text:
i can remember buying this candy as a kid and the quality has not dropped in all these years  still a superb product you will not be disappointed with 
_________________________
Data Point #3
Summary:
forget molecular gastronomy   this stuff rockes a coffee creamer 
Full Text:
i know the product title says molecular gastronomy  but do not let that scare you off  i have been looking for this for a while now  not for food science  but for something more down to earth  i use it to make my own coffee creamer   i have to have my coffee blonde and sweet   but the flavored creamers are full of the bad kinds of fat  and honestly  i hate to use manufactured  food  items  i really do not think they are good for the body  on the other hand  i