In [20]:
import re
import copy
import pandas as pd
import nltk
from string import punctuation
from collections import Counter
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

class PreprocessTweet:

    def all_caps(self, tweet_word):
        '''
            Input is a string - tweet_word
            Returns boolean whether the word has all capital letters
        '''
        isuppercase = re.match(r'^[A-Z]+$', tweet_word)
        return bool(isuppercase)
    
    def repeating_letters(self, tweet_word):
        '''
            Input is a string - tweet_word
            Return boolean whether the word has letter that repeats consequently
        '''
        for i in range(len(tweet_word)-2):
            if tweet_word[i] == tweet_word[i+1] and tweet_word[i] == tweet_word[i+2]:
                return True
            
        return False
        

    def tweet_patterns(self, tweet, multiple):
        '''
            Inputs:
            - tweet of type string: it is the actual tweet
            - multiple of type boolean: it represents whether to look for 
                multiple exclamation marks and 
                multiple words in capital letters and 
                multiple repeating letters in words
            Returns three boolean variables in order:
                has_exclamation_mark, has_all_caps, has_repeating_letters
        '''
        
        has_exclamation_mark = False
        has_all_caps = False
        has_repeating_letters = False
        
        tweet_words = tweet.split()
        list_all_caps = [self.all_caps(word) for word in tweet_words]
        list_repeating_letters = [self.repeating_letters(word) for word in tweet_words]
        
        if multiple:
            
            counted_all_caps = Counter(list_all_caps)
            counted_repeating_letters = Counter(list_repeating_letters)
            counted_exclamation_marks = [Counter(word)['!'] for word in tweet_words]
            
            if counted_all_caps[True] > 1:
                has_all_caps = True
            if counted_repeating_letters[True] > 1:
                has_repeating_letters = True
            if sum(counted_exclamation_marks) > 1:
                has_exclamation_mark = True
        
        else:
            
            if True in list_all_caps:
                has_all_caps = True

            if True in list_repeating_letters:
                has_repeating_letters = True

            if "!" in tweet:
                has_exclamation_mark = True
            
        return has_exclamation_mark, has_all_caps, has_repeating_letters
    
    
    def remove_stopwords(self, tweet_df):
        '''
            Input is a DataFrame from pandas - tweet_df - with a 'text'
            column of strings
            Adds a 'stopwordsx' column that holds each text with the English
            stopwords from NLTK removed: the text is split on whitespace,
            words found in the stopword list are dropped (exact,
            case-sensitive comparison) and the rest is joined with single
            spaces
            Returns the same DataFrame; note that it is modified in place
        '''
        # A set gives constant-time membership tests; the plain list would be
        # scanned from the start for every word of every tweet.
        stop = set(stopwords.words('english'))
        tweet_df['stopwordsx'] = tweet_df['text'].apply(
            lambda text: ' '.join(word for word in text.split() if word not in stop)
        )
        return tweet_df
    
    
    def get_wordnet_pos(self, tag):
        '''
            Input is a string - tag - a part-of-speech tag
            (lemmatize passes the tags produced by nltk.pos_tag)
            Returns the matching WordNet part-of-speech constant, chosen by
            the first letter of the tag:
                J -> adjective, V -> verb, R -> adverb
            Every other tag, including those starting with N, maps to noun
        '''
        prefix_to_pos = (
            ('J', wordnet.ADJ),
            ('V', wordnet.VERB),
            ('R', wordnet.ADV),
        )
        for prefix, pos in prefix_to_pos:
            if tag.startswith(prefix):
                return pos

        # Nouns and anything unrecognised share the same answer.
        return wordnet.NOUN
        
        
    def lemmatize(self, sentences):
        '''
            Input is an iterable of strings - sentences
            Returns a list with one entry per sentence, in the same order;
            each entry is the list of lemmatized tokens of that sentence

            Every sentence is tokenized with TweetTokenizer and tagged with
            nltk.pos_tag; each token is then lemmatized with
            WordNetLemmatizer using the part of speech that
            get_wordnet_pos derives from its tag
        '''
        tokenizer = TweetTokenizer()
        lemmatizer = WordNetLemmatizer()

        lemmatized = []
        # Iterate the sentences directly instead of indexing by position, so
        # inputs that are not positionally indexable (for example a pandas
        # Series with a non-default index) are handled as well.
        for sentence in sentences:
            tagged_tokens = nltk.pos_tag(tokenizer.tokenize(sentence))
            lemmatized.append([
                lemmatizer.lemmatize(token, self.get_wordnet_pos(tag))
                for token, tag in tagged_tokens
            ])

        return lemmatized
            
    
    def preprocess(self, tweet, use_stop, remove_numbers):
        '''
            Input is of type string: it is the actual tweet
            Returns a tweet of type string which is:
            - lowercased
            - without any punctuation
            - without user tags, i.e. words that start with @
            - without multiple spaces between the words
        '''
        
        tags = re.compile(r"&.*;")
        if use_stop:
            punct = re.compile(r'[{}]+'.format(re.escape(punctuation.replace("\'",""))))
        else:
            punct = re.compile(r'[{}]+'.format(re.escape(punctuation)))
        stopwords = re.compile(r'[{}]+'.format(re.escape("'")))
        non_ascii = re.compile('[^\x00-\x7F]+')
        numbers = re.compile(r'[0-9]')
        links = re.compile(r'http\S+')
        hashtags = re.compile(r'#\w+')
        user = re.compile(r"@\w+")
        spaces = re.compile(r" +")
        
        tweet = tweet.lower()
        tweet = tags.sub('', tweet).strip()
        tweet = user.sub('usertag', tweet).strip()
        tweet = hashtags.sub('', tweet).strip()
        tweet = links.sub('linktag', tweet).strip()
        if remove_numbers:
            tweet = numbers.sub('', tweet).strip()
        tweet = non_ascii.sub('', tweet).strip()
        tweet = punct.sub(' ', tweet).strip()
        if use_stop:
            tweet = stopwords.sub('', tweet)
        tweet = spaces.sub(' ', tweet)
        
        return tweet
    
    def preprocess_tweets(self, tweets, pattern, use_stop, remove_numbers):
        '''
            Inputs:
            - tweets of type DataFrame from pandas with 2 columns:
                1. Sentiment Label
                2. Tweet
            - pattern of type boolean which is used for finding 
              single/multiple characters in the function tweet_patterns
            Returns a new DataFrame from pandas with 5 columns:
                1. Sentiment Label
                2. Preprocessed Tweet
                3. Whether the tweet had single/multiple exclamation marks
                4. Whether the tweet had single/multiple words in capital letters
                5. Whether the tweet had single/multiple words that had consequently repeating charaters 
        '''
        
        data = []
        
        for raw_tweet in tweets.values:
            record = []
            mark, caps, letters = self.tweet_patterns(raw_tweet[1], pattern)
            tweet = self.preprocess(raw_tweet[1], use_stop, remove_numbers)
            
            record.append(raw_tweet[0])
            record.append(tweet)
            record.append(mark)
            record.append(caps)
            record.append(letters)
            data.append(record)
            
        df = pd.DataFrame(data, columns=['Label', 'Tweet', 'Mark', 'Caps', 'Letters'])
        
        return df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
