# To Explore and Process the data



In [51]:
import pandas as pd

In [None]:
# Vocabulary count files for the partial tweet sets.
path_pos, path_neg = "./vocab_cut_pos.txt", "./vocab_cut_neg.txt"

# Vocabulary count files for the full tweet sets.
path_pos_full, path_neg_full = "./vocab_cut_pos_full.txt", "./vocab_cut_neg_full.txt"

# Vocabulary count file for the tweets that have to be classified.
path_test_data = "./vocab_test_data.txt"

# NOTE(review): looks like an unfinished path to the positive tweets - confirm.
tweets_pos = "./"

In [None]:
def build_df(filepath):
    """Load a vocabulary count file into a dataframe.

    Every line of the file is read as one string and split on whitespace:
    the first field is the number of occurrences, the second one the word.

    filepath: path of the vocabulary file to read.

    Returns a dataframe with the columns "word" and "occurence".
    """

    def leading_count(line):
        # First whitespace-separated field, e.g. "  12 hello" -> 12
        return int(line.split()[0])

    def token(line):
        # Second whitespace-separated field, e.g. "  12 hello" -> "hello"
        return line.split()[1]

    frame = pd.read_table(filepath_or_buffer=filepath, header=None, names=["word"])
    raw_lines = frame["word"]
    frame["occurence"] = raw_lines.map(leading_count)
    frame["word"] = raw_lines.map(token)
    return frame

#### -------------- Partial tweets ----------------------------------
# Word/occurrence tables of the partial positive and negative sets
pos, neg = build_df(path_pos), build_df(path_neg)

#### -------------- Full tweets ----------------------------------
# Word/occurrence tables of the full positive and negative sets
pos_full, neg_full = build_df(path_pos_full), build_df(path_neg_full)

#### -------------- Test tweets ----------------------------------
# Word/occurrence table of the tweets that have to be classified
test_data = build_df(path_test_data)

In [None]:
# Sanity check: compare the number of distinct words of each vocabulary
# with the value expected for it.
shape_checks = [
    ("Neg full shape should be 78028 is : ", neg_full),
    ("Pos full shape should be 46009 is : ", pos_full),
    ("Neg shape should be 16418 is : ", neg),
    ("Pos shape should be 9604 is : ", pos),
]
for message, frame in shape_checks:
    print(message + str(frame.shape[0]))

In [None]:
def merging(neg, pos, min_difference=5):
    """Merge the negative and positive vocabularies and score every word.

    neg: dataframe with the columns "word" and "occurence" (negative tweets).
    pos: dataframe with the columns "word" and "occurence" (positive tweets).
    min_difference: a word is kept only if the absolute difference between
        its negative and positive occurrences is at least this value
        (default 5, the threshold used so far).

    Returns a new dataframe with the columns "word", "occurence_neg",
    "occurence_pos", "difference", "somme" (sum of both occurrences) and
    "ratio". The ratio tells how relevant the word is to judge the
    happiness/sadness of a tweet: 0 when the word is as frequent in both
    classes, 1 when it only appears in one of them. The inputs are left
    untouched.
    """
    # Outer join: a word missing from one side gets NaN there, i.e. 0 occurrences
    merged = pd.merge(left=neg, right=pos, left_on="word", right_on="word",
                      suffixes=('_neg', '_pos'), how="outer")
    merged = merged.fillna(0)

    # Keep only the words whose occurrences differ enough between both classes.
    merged["difference"] = abs(merged["occurence_neg"] - merged["occurence_pos"])
    # .copy() so the columns added below are written to an independent frame
    # and not to a slice of the unfiltered one.
    merged = merged[merged["difference"] >= min_difference].copy()

    # Sum of the occurrences over both classes
    merged["somme"] = merged["occurence_neg"] + merged["occurence_pos"]

    # 2 * |0.5 - share of positive occurrences|; a word whose "somme" is 0
    # (only reachable with min_difference <= 0) gets NaN.
    merged["ratio"] = 2 * abs(0.5 - merged["occurence_pos"] / merged["somme"])

    # To sort the result:
    # merged.sort_values(by=["ratio", "somme"], ascending=[False, False])

    return merged


# Scored vocabularies: first the partial tweet sets, then the full ones
merged, merged_full = merging(neg, pos), merging(neg_full, pos_full)

In [None]:
# Top 5 words of the partial sets, highest ratio first, ties broken by the highest sum
merged.sort_values(by = ["ratio","somme"], ascending=[False, False]).head(5)

In [None]:
# Same ranking for the full sets (head() shows the first 5 rows by default)
merged_full.sort_values(by = ["ratio","somme"], ascending=[False, False]).head()

##### From here, we realise that some words have a strangely high occurrence in the negative tweets.
By looking at the words in context, we realised that some tweets occur more than once.
We checked whether those words are also in the test_data that we have to classify. The check was positive.
We will therefore capture those words (e.g. "1gb" or "cd-rom"), because they are likely to be in the test_data set, and directly classify the tweets containing those words. We will also drop all the duplicate tweets from the training set, so that other words are not neglected; this will also save computational power.

Two examples of such tweets are:

    1) 1.26 - 7x14 custom picture frame / poster frame 1.265 " wide complete cherry wood frame ( 440ch this frame is manufactu ... <url>
    
    2) misc - 50pc diamond burr set - ceramics tile glass lapidary for rotary tools ( misc . assorted shapes and sizes for your ... <url>

In [None]:
# Look in the test vocabulary for the words that open the two example tweets
for prefix in ("1.26", "misc"):
    print(test_data.loc[test_data.word.str.startswith(prefix)])

Filter the dataframe to keep only the rows with ratio = 1 and difference >= `MIN_diff`

In [None]:
# Minimum occurrence difference for a word to be kept.
# TBD according to: as soon as a "pos_for_sure" and a "neg_for_sure" word appear
# within the same tweet, MIN_diff = difference + 1 of the smaller of the two words.
MIN_diff = 200

# Words seen in one class only (ratio == 1) with a large enough difference
mf_max_ratio = merged_full[(merged_full.ratio == 1) & (merged_full.difference >= MIN_diff)]
mf_max_ratio = mf_max_ratio[["word","difference"]]
# List of [word, difference] rows
word_max_ratio = list(mf_max_ratio.values)


def set_min_diff_and_delete_uninteressting():
    """Placeholder: tune MIN_diff and drop the uninteresting words.

    The original cell only contained the bare signature (without colon or
    body), which was a syntax error and prevented the cell from running.
    The logic still has to be written.
    """
    raise NotImplementedError("set_min_diff_and_delete_uninteressting is not implemented yet")


# Check the type of the stored difference of the first kept word
type(word_max_ratio[0][1])

In [None]:
# Words of the test vocabulary that look like a laugh ("hah..." or "ahah...")
starts_with_hah = test_data.word.str.startswith("hah")
starts_with_ahah = test_data.word.str.startswith("ahah")
word_haha = test_data.loc[starts_with_hah | starts_with_ahah]
word_haha.head()

In [None]:
# Sum the occurrences of the haha-like words only; summing test_data.occurence
# would give the total over the whole test vocabulary instead.
print("Occurence of the words that can be remplaced by haha = "+ str(word_haha.occurence.sum()))

# We make the list of all those words that have this same semantic
word_haha_list = list(word_haha.word)

In [None]:
# Load the raw test tweets, one per line, dropping the leading "nbr," prefix.
# The line is stripped first and then cut after its first comma, so leading
# whitespace cannot shift the cut; a line without a comma is kept as is.
with open('test_data.txt', 'r') as f:
    tweets = [line.strip().split(",", 1)[-1] for line in f]

firstTweet = tweets[0]
firstTweet

In [None]:
# Scratch check of str.find / str.replace on the first tweet. Both calls
# return new values and leave firstTweet unchanged; the result of find()
# is discarded.
firstTweet.find("doo")
firstTweet.replace("doo", "fovgho")

In [None]:
def find_and_remplace(semantics, representative, tweets):
    """Replace every word of a semantic group by its representative.

    Tweets are handled as space-separated tokens: each token that is equal
    to one of the ``semantics`` words is replaced by ``representative``.
    Matching whole tokens (and not substrings) keeps unrelated words such as
    "shah" intact and makes the result independent of the order of
    ``semantics``.

    semantics: iterable of words sharing the same meaning (e.g. "hahaha").
    representative: the word written in place of each of them (e.g. "haha").
    tweets: list of tweets (strings); it is updated in place.

    Returns the same ``tweets`` list, after the replacement.
    """
    variants = set(semantics)

    for i, tweet in enumerate(tweets):
        tokens = tweet.split(" ")
        if variants.isdisjoint(tokens):
            # Nothing to replace: leave the tweet untouched
            continue
        tweets[i] = " ".join(
            representative if tok in variants else tok for tok in tokens
        )

    return tweets


In [None]:

# Map every haha-like word of the tweets to "haha".
# NOTE: find_and_remplace updates `tweets` in place and returns it, so
# `tweets2` and `tweets` are the same list.
tweets2 = find_and_remplace(word_haha_list, "haha", tweets)

In [None]:
# Spot check of one tweet after the replacement
tweets2[481]

In [60]:
def _read_vocab(path):
    """Read a "<count> <word>" vocabulary file into a word/occurence dataframe."""
    vocab = pd.read_table(filepath_or_buffer=path, header=None, names=["word"])
    lines = vocab["word"]
    vocab["occurence"] = lines.map(lambda x: int(x.split()[0]))
    # Words are kept as text (no .encode), so they are not turned into bytes
    vocab["word"] = lines.map(lambda x: x.split()[1])
    return vocab


def create_data(lower_bound):
    """Build the relevance table of the full vocabulary and store it on disk.

    Reads "./vocab_cut_pos_full.txt" and "./vocab_cut_neg_full.txt", merges
    them, keeps the words whose occurrences differ by at least 5 between the
    two classes and computes for each word:
      - "difference": absolute difference of the occurrences,
      - "somme": sum of the occurrences,
      - "ratio": 0 if the word is not relevant to judge happiness/sadness,
        1 if it only appears in one class.
    The ratio of the words seen fewer than ``lower_bound`` times in total is
    forced to 0. The table is sorted by decreasing ratio, then decreasing
    sum, and written to "relevant_vocab_full_lb=<lower_bound>.txt"
    (space-separated, index included).

    lower_bound: minimal total number of occurrences for a word to keep its
        ratio.

    Returns None; the result is the file written in the working directory.
    """
    # Paths of the positive and negative vocabularies
    path_pos = "./vocab_cut_pos_full.txt"
    path_neg = "./vocab_cut_neg_full.txt"

    # Occurrences of each word in the happy (pos) and sad (neg) tweets
    pos = _read_vocab(path_pos)
    neg = _read_vocab(path_neg)

    # Outer join: a word missing from one side gets NaN there, i.e. 0 occurrences
    merged = pd.merge(left=neg, right=pos, left_on="word", right_on="word",
                      suffixes=('_neg', '_pos'), how="outer")
    merged = merged.fillna(0)

    # Only keep the words whose occurrences differ by at least 5 between the
    # sad and happy tweets. .copy() so the columns added below are written to
    # an independent frame and not to a slice of the unfiltered one.
    merged["difference"] = abs(merged["occurence_neg"] - merged["occurence_pos"])
    merged = merged[merged["difference"] >= 5].copy()

    # Sum of the occurrences
    merged["somme"] = merged["occurence_neg"] + merged["occurence_pos"]

    # 2 * |0.5 - share of positive occurrences|
    merged["ratio"] = 2 * abs(0.5 - merged["occurence_pos"] / merged["somme"])

    # Words with fewer than `lower_bound` occurrences in total are not trusted
    merged["ratio"] = merged["ratio"].where(merged["somme"] >= lower_bound, 0)

    # sort_values returns a new frame: keep it, otherwise the file is unsorted
    merged = merged.sort_values(by=["ratio", "somme"], ascending=[False, False])

    # Store the data
    filename = "relevant_vocab_full_lb=" + str(lower_bound) + ".txt"
    merged.to_csv(path_or_buf=filename, sep=' ')
        

In [61]:
# Only the table for lower_bound = 5000 is generated. Disabled sweep over
# several lower bounds:
#for lb in [50, 100, 500, 1000, 2000, 5000, 10000, 20000, 50000]:
create_data(5000)
    