# Basics: [DataFrames](https://www.datacamp.com/community/tutorials/pandas-tutorial-dataframe-python#gs.Ulu69Pg)

In [None]:
from __future__ import absolute_import, division, print_function
import pandas as pd
import numpy as np

## Creation

In [None]:
# From a 2-D numpy array: rows and columns get default integer labels.
arr = np.array([[1, 2, 3], [4, 5, 6]])
print("From a numpy array:", pd.DataFrame(arr), sep="\n")

# From a dictionary: each key becomes a column label, each value a column.
dic = {1: ['1', '3'], 2: ['1', '2']}
print("From a dictionary:", pd.DataFrame(dic), sep="\n")

# From an existing DataFrame (built here with an explicit index and column name).
df = pd.DataFrame(data=[4, 5, 6, 7], index=range(0, 4), columns=['A'])
print("From a df:", pd.DataFrame(df), sep="\n")

In [None]:
np.info(df.drop)

## Inspection

In [None]:
# Wrap a 2x3 numpy array in a DataFrame and report its dimensions.
df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]))
n_rows = len(df.index)
print("Shape:", df.shape)
print("Height:", n_rows, end="\n\n")

## Indexing

In [None]:
# Define example df:
arr = np.arange(1, 10).reshape(3, 3)
df = pd.DataFrame(arr, columns=['A', 'B', 'C'])
print("The examples below all access the top-left '1' in . . . \n", df.head(), "\n")

# iloc (position-based) and loc (label-based) are the 2 main ones.
# A single [row, column] indexer is used instead of chained indexing.
df.iloc[0, 0]
df.loc[0, 'A']

# at / iat are the scalar-only counterparts of loc / iloc.
# (DataFrame.get_value was removed in pandas 1.0; `at` is its replacement.)
df.at[0, 'A']
df.iat[0, 0]

## Expanding/Removing/Changing Data  from Rows/Columns

#### Adding a New Column

In [None]:
# Draw one random integer in [0, 10) per row and attach it as 'new_col'.
# `assign` returns a new frame, so `df` itself is left unchanged.
random_values = pd.Series(np.random.randint(10, size=len(df))).values
new_df = df.assign(new_col=random_values)
# Add a 'name' column whose values are the row index.
new_df['name'] = new_df.index
new_df.head()

## Changing Data

#### Renaming Columns

In [None]:
# Show `df` as it looks before renaming.
print(df)

# Map each current column label to its replacement.
newcols = dict(
    A='new_column_1',
    B='new_column_2',
    C='new_column_3',
)

# Relabel the columns of `df` in place.
df.rename(columns=newcols, inplace=True)

# Relabel index entry 1 as 'a'. Without `inplace=True` this returns a new
# DataFrame and leaves `df` itself untouched.
df.rename(index={1: 'a'})

#### Replacing String Patterns with Regex

In [None]:
# Build a 3x3 frame of digit strings, with a trailing newline on three cells.
digits = np.arange(1, 10).reshape(3, 3)
arr = [list(map(str, row)) for row in digits]
for r, c in ((0, 1), (1, 0), (2, 2)):
    arr[r][c] += '\n'
df = pd.DataFrame(arr)
print("Before:\n", df)

# Swap every newline for an HTML line break, modifying `df` in place.
df.replace({'\n': '<br>'}, regex=True, inplace=True)
print("After:\n", df)

In [None]:
print(df)
# Replace strings by others with `regex`
df.replace({'\n': '<br>'}, regex=True, inplace=True)
print(df)

# Customizing Display Options 

In [None]:
# Two-row example frame: a long text column next to a numeric one.
columns = {
    'text': ['foo foo foo foo foo foo foo foo', 'bar bar bar bar bar'],
    'number': [1, 2],
}
df = pd.DataFrame(columns)
df.head()

In [None]:
# Print the documentation of Styler.set_properties.
np.info(df.style.set_properties)
# Keep the styled frame as the cell's last expression: a notebook only renders
# the value of the final expression, so placing it earlier would discard it.
df.style.set_properties(subset=['text'], **{'width': '800px'})

# Saving and Loading Data

In [None]:
import pandas as pd
import numpy as np
# Toy training log: one row per run, one column per recorded quantity.
learning_rate = [0.5, 0.1]
loss = [0.12, 0.165]
df = pd.DataFrame(dict(loss=loss, learning_rate=learning_rate))
df.head()

#### Creating/Appending and Saving DF to File
Creates the file if it does not exist, else appends to the existing one. Useful for repeated updates to data files.

In [None]:
# When writing to a new and/or empty file, do . . .
# Passing the path with mode='a' lets pandas open the file itself (creating it
# if needed) with the newline handling it expects, instead of relying on a
# manually opened text-mode handle.
df.to_csv('io_test.csv', mode='a')
# When you know the file already exists & isn't empty, do . . .
# Don't include header in appended content.
df.to_csv('io_test.csv', mode='a', header=False)

In [None]:
%%bash
cat io_test.csv

#### Loading From CSV Into DF

In [None]:
df_load = pd.read_csv('io_test.csv', index_col=0)
df_load.head()

In [None]:
# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
# gives the same result. ignore_index=True renumbers the rows 0..n-1.
new_row = pd.DataFrame([{"loss": 200, "learning_rate": 0.01}])
df_load = pd.concat([df_load, new_row], ignore_index=True)
df_load.head()

# Test Outputs in EDP

In [None]:
import nltk
import os
import re
import time
import json
import enchant
import multiprocessing
import sys
if os.getcwd() == '/home/brandon/Documents/seq2seq_projects/notebooks':
    sys.path.append('..')
from data import data_helper
import numpy as np
import pandas as pd

from data import DataHelper
from functools import wraps
from pprint import pprint
from itertools import chain
from collections import Counter
from multiprocessing import Pool
from progressbar import ProgressBar

# Global helper object that helps abstract away locations of
# files & directories, and keeps an eye on memory usage.
# NOTE: this rebinds the name `data_helper`, which was imported above as a
# module (`from data import data_helper`); from here on it is the instance.
data_helper = DataHelper()
# Max number of words in any saved sentence.
# Passed as `max_len` to remove_large_comments, which keeps only comments
# with strictly fewer words than this value.
MAX_SEQ_LEN = 11
# Number of CPU cores available.
# Used as the worker count for the multiprocessing Pool in parallel_map_list.
# NOTE(review): hard-coded; confirm it matches the machine this runs on.
NUM_CORES = 8
# How many chunks we should split dataframes into at any given time.
# Used as the section count for np.array_split in parallel_map_list.
NUM_PARTITIONS = 256

In [None]:
def timed_function(*expected_args):
    """Build a decorator that prints how long each call to a function takes.

    Args:
        *expected_args: Optional. The first positional value is the label
            shown in the timing message. When no label is given, the
            decorated function's ``__name__`` is used instead.

    Returns:
        A decorator. The wrapper it produces calls the function, prints
        "Time to run <label>: <seconds> seconds." and returns the
        function's result unchanged. Nothing is printed if the function
        raises.
    """
    def decorator(fn):
        # Resolve the label once, at decoration time, and fall back to the
        # function's own name so `@timed_function()` does not fail on call.
        fname = expected_args[0] if expected_args else fn.__name__

        @wraps(fn)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            res = fn(*args, **kwargs)
            elapsed = time.time() - start_time
            print("Time to run %s: %.3f seconds." % (fname, elapsed))
            return res
        return wrapper
    return decorator

@timed_function('parallel_map_list')
def parallel_map_list(fn, iterable):
    """Apply `fn` to chunks of `iterable` in worker processes and rejoin the results.

    Based on great explanation from 'Pandas in Parallel' (racketracer.com).

    Args:
        fn: Callable applied to one chunk at a time (not one element at a
            time). Its return value must be something np.concatenate can join.
        iterable: Array-like that np.array_split divides into
            NUM_PARTITIONS chunks.

    Returns:
        The per-chunk results joined with np.concatenate.
    """
    chunks = np.array_split(iterable, NUM_PARTITIONS)
    pool = Pool(NUM_CORES)
    try:
        results = pool.map(fn, chunks)
    finally:
        # Always release the worker processes, even if `fn` raised.
        pool.close()
        pool.join()
    return np.concatenate(results)


In [None]:
df = data_helper.safe_load(max_mem=1.0)

## Reddit API Info
[[Source]](https://www.github.com/reddit/reddit/wiki/JSON)

__Base class__

name | description 
----- | :------- 
id | this item's identifier (not english)
name | Fullname of comment (not english)


__Comments__: 

name | description 
----- | :------- 
author | account name of the poster (english)
body | raw comment text
link_id | ID of the link this comment is in
parent_id | ID of the thing this comment is a reply to, either the link or a comment in it




In [None]:
id_cols = df[['author', 'name', 'link_id', 'parent_id']]
id_cols.head()

### Hi Mitch: check dis error:

In [None]:
name_dist = nltk.FreqDist(id_cols.name.values)
name_dist.most_common()[-10:]

In [None]:
df[df.name == 't1_c02afvg']

In [None]:
def root_comments(df):
    '''Flag, row by row, whether a comment is a root comment.

    A row is flagged when its `parent_id` equals its `link_id`.

    Returns:
        list with one entry per row of `df`, in row order.
    '''
    # itertuples yields one namedtuple per row (index first), so the
    # columns can be read as attributes.
    return [row.parent_id == row.link_id for row in df.itertuples()]

@timed_function('initial_clean')
def initial_clean(df):
    """Add the 'root' flag and keep only the columns used downstream.

    Note that the 'root' column is written onto the `df` passed in before the
    column subset is taken, so the caller's frame gains that column too.

    Args:
        df: DataFrame with at least the columns author, body, link_id,
            parent_id, name and subreddit.

    Returns:
        DataFrame restricted to author, body, link_id, parent_id, name,
        root and subreddit.
    """
    df['root'] = root_comments(df)
    # Display styling is deliberately not done here: a Styler built inside
    # this function would be discarded and never rendered.
    # TODO: Can probably remove 'subreddit' column.
    return df[['author', 'body', 'link_id', 'parent_id', 'name', 'root', 'subreddit']]


df = initial_clean(df)

In [None]:
df.head()

In [None]:
print('len(df.index) =', len(df.index))
df.describe()

## Regex Replacements

In [None]:
@timed_function('regex_replacements')
def regex_replacements(df):
    """Drop deleted comments, normalise case, and apply the regex clean-ups.

    Args:
        df: DataFrame with a string 'body' column.

    Returns:
        A new DataFrame (fresh 0..n-1 index) without '[deleted]' comments,
        whose 'body' values are stripped, lowercased and rewritten by every
        pattern -> replacement pair in data_helper.modify_list.
    """
    # Remove comments that are '[deleted]'.
    df = df.loc[df.body != '[deleted]'].reset_index(drop=True)
    # Make all comments lowercase to help reduce vocab size.
    body = df['body'].map(lambda s: s.strip().lower())
    # Apply the replacements one at a time, in modify_list order, so each
    # pattern sees the output of the previous ones.
    for old, new in data_helper.modify_list.items():
        body = body.replace({old: new}, regex=True)
    # Assign the result explicitly: calling replace(inplace=True) on
    # df['body'] is a chained operation that is not guaranteed to write
    # back into df (it does not under pandas Copy-on-Write).
    df['body'] = body
    # Remove comments with this extremely common occurrence.
    #df = df.loc[df.body != 'NUMBER'].reset_index(drop=True)
    return df


df = regex_replacements(df)

In [None]:
print('len(df.index) =', len(df.index))
df.head()

In [None]:
df.describe()

## Remove Large Comments

In [None]:
@timed_function('remove_large_comments')
def remove_large_comments(max_len, df):
    """Keep only short comments that contain no 'http'.

    Args:
        max_len: Exclusive upper bound on the number of whitespace-separated
            tokens a kept comment may have.
        df: DataFrame with a string 'body' column.

    Returns:
        DataFrame with a fresh 0..n-1 index holding the rows whose body has
        fewer than `max_len` tokens and does not contain 'http'.
    """
    # Could probably do a regex find on spaces to make this faster.
    token_counts = df['body'].map(lambda text: len(text.split()))
    short = df[token_counts < max_len].reset_index(drop=True)
    has_no_link = short['body'].map(lambda text: 'http' not in text)
    return short[has_no_link].reset_index(drop=True)


df = remove_large_comments(max_len=MAX_SEQ_LEN, df=df)

In [None]:
print('len(df.index) =', len(df.index))
df.head()

In [None]:
df.describe()

## Expand Contractions

In [None]:
@timed_function('expand_contractions')
def expand_contractions(df):
    """Replace all contractions with their expanded form.

    Args:
        df: DataFrame with a string 'body' column.

    Returns:
        The same `df`, with its 'body' column rewritten by every
        contraction -> expansion pair in data_helper.contractions
        (each key is applied as a regular expression).
    """
    body = df['body']
    for contraction, as_words in data_helper.contractions.items():
        body = body.replace({contraction: as_words}, regex=True)
    # Assign the result explicitly: calling replace(inplace=True) on
    # df['body'] is a chained operation that is not guaranteed to write
    # back into df (it does not under pandas Copy-on-Write).
    df['body'] = body
    return df


df = expand_contractions(df)

In [None]:
print('len(df.index) =', len(df.index))
df.head()

In [None]:
df.describe()

In [None]:
@timed_function('children_dict')
def children_dict(df):
    """Map each parent id to the names of its immediate children.

    Goes through all comments. Root comments are skipped, since they won't
    have a parent_id that corresponds to a comment. Assumes that df has
    'root', 'parent_id' and 'name' columns.

    Returns:
        dict whose keys are the `parent_id` values of the non-root rows and
        whose values are lists of the `name` values of those rows, in row
        order.
    """
    children = {}
    for row in df.itertuples():
        if not row.root:
            # setdefault creates the list the first time a parent is seen.
            children.setdefault(row.parent_id, []).append(row.name)
    return children

In [None]:
sentences = parallel_map_list(fn=DataHelper.word_tokenizer, iterable=df.body.values)

In [None]:
freq_dist = nltk.FreqDist(chain.from_iterable(sentences))
n = 30
print("Top %d most common words:" % n)
pprint(freq_dist.most_common(n))

In [None]:
print("Words that frequently appear together:")
text = nltk.Text(chain.from_iterable(sentences))
text.collocations()

In [None]:
data_helper.set_word_freq(Counter(chain.from_iterable(sentences)))

## Sentence Scoring

In [None]:
def sentence_score(sentences):
    """Score each sentence by its words that fail the en_US spell check.

    Every word `w` of a sentence that the enchant 'en_US' dictionary rejects
    contributes 1 / ((word_freq[w] + 1e-20) * (len(sentence) + 1e-20)),
    where word_freq is data_helper.word_freq. The 1e-20 terms keep both
    factors non-zero.

    Args:
        sentences: Iterable of sentences, each a sequence of words.

    Returns:
        list with one score per sentence, in input order.
    """
    word_freq = data_helper.word_freq
    speller = enchant.Dict('en_US')

    def score(sentence):
        # Contribution of one sentence: sum over its unrecognised words.
        word_count = len(sentence) + 1e-20
        return sum(1.0 / ((word_freq[w] + 1e-20) * word_count)
                   for w in sentence if not speller.check(w))

    return [score(sentence) for sentence in sentences]

In [None]:
df['score'] = parallel_map_list(fn=sentence_score, iterable=sentences)

In [None]:
df['score'].describe()

In [None]:
rand_gen = DataHelper.random_rows_generator(10, len(df))
df.head()

In [None]:
df.loc[next(rand_gen)].head()

## Dictionaries to Save

In [None]:
df.columns

In [None]:
df.body.values.shape

In [None]:
len(set(df.name.values.tolist()))

In [None]:
# Total number of entries in the 'name' column (duplicates included).
len(df.name.values.tolist())