*Moved the CleanText_Arabic class to preprocessing.ipynb*

In [1]:
import re
import string

import pandas as pd
from nltk.corpus import stopwords
from snowballstemmer import stemmer
from textblob import TextBlob
# Arabic Snowball stemmer instance; the stem step of CleanText_Arabic calls
# its stemWord() method on every token.
ar_stemmer = stemmer("arabic")

ModuleNotFoundError: No module named 'textblob'

In [None]:
# Raw train/dev/test splits, read below with pd.read_table.
# NOTE(review): presumably the SemEval-2018 Arabic valence (ordinal
# classification) files -- confirm against the data source.
training_path = "vaguely_ML_masters/data/raw/2018-Valence-oc-Ar-train.txt"
dev_path = "vaguely_ML_masters/data/raw/2018-Valence-oc-Ar-dev.txt"
test_path = "vaguely_ML_masters/data/raw/2018-Valence-oc-Ar-test.txt"

# Destinations for the cleaned splits (written as tab-separated files).
training_outpath = "vaguely_ML_masters/data/cleaned/Ar_cleaned_training.txt"
dev_outpath = "vaguely_ML_masters/data/cleaned/Ar_cleaned_dev.txt"
test_outpath = "vaguely_ML_masters/data/cleaned/Ar_cleaned_test.txt"

# Stop-word file, read line by line into the module-level `stopwords`.
stopwords_path = "vaguely_ML_masters/utilities/arabic-stop-words-list.txt"

In [None]:
# Read the three raw splits with identical default parsing.
training_df, dev_df, test_df = (
    pd.read_table(split_path)
    for split_path in (training_path, dev_path, test_path)
)

# Notebook display of the training frame.
training_df

In [None]:
# Load the Arabic stop-word file (one entry per line) into a set so the
# `word not in stopwords` checks done while cleaning are constant-time.
# NOTE: this rebinds `stopwords`, shadowing the nltk import of the same name.
with open(stopwords_path, "r", encoding="utf-8") as infile:
    # Tokens produced by str.split() never carry whitespace, so entries are
    # stripped to make padded lines match; blank lines are skipped.
    stopwords = {
        entry for entry in (line.strip() for line in infile) if entry
    }

In [None]:
def adapt_valence_scores(df):
    """
    Reduce each "Intensity Class" label to its leading valence code.

    Only the first two characters of every label are kept and any colon
    among them is dropped, so a label starting with ``"0:"`` yields ``"0"``
    and one starting with ``"-3"`` yields ``"-3"``.

    Args:
        df: table with an "Intensity Class" column of string labels
            (a pandas DataFrame in this notebook).

    Returns:
        A list with one shortened label (str) per row, in row order.
    """
    # Iterating the column directly avoids building a Series per row, which
    # is what DataFrame.iterrows() would do.
    return [label[:2].replace(":", "") for label in df["Intensity Class"]]
        
# Shortened valence labels for each split, in the same row order as the
# corresponding frame.
training_valence, dev_valence, test_valence = (
    adapt_valence_scores(split_df)
    for split_df in (training_df, dev_df, test_df)
)

In [None]:
class CleanText_Arabic(BaseEstimator, TransformerMixin):
    """
    Scikit-learn style transformer that cleans Arabic tweets.

    Adapted from
    https://towardsdatascience.com/sentiment-analysis-with-text-mining-13dd2b33de27

    Every cleaning method takes one string and returns the cleaned string;
    ``transform`` chains them over a whole column. ``remove_stopwords`` and
    ``stem`` read the module-level ``stopwords`` collection and
    ``ar_stemmer`` object, which must exist before ``transform`` is called.

    NOTE(review): ``BaseEstimator`` and ``TransformerMixin`` are presumably
    the scikit-learn base classes (``sklearn.base``) -- confirm they are
    imported before this class is defined.
    """

    def remove_repeating_char(self, input_text):
        """Collapse any run of a repeated character down to two copies."""
        return re.sub(r'(.)\1+', r'\1\1', input_text)  # keep 2 repeats

    def remove_mentions(self, input_text):
        """Delete @-mentions (``@`` followed by word characters)."""
        return re.sub(r'@\w+', '', input_text)

    def remove_urls(self, input_text):
        """Replace every URL with the placeholder token "لينك"."""
        # No spaces inside the alternation: a URL at the very start or end
        # of the text must match too. \b keeps "www" from matching in the
        # middle of an ordinary word.
        return re.sub(r"http\S+|\bwww\S+", "لينك", input_text)

    def remove_hashtags(self, input_text):
        """Drop the ``#`` sign but keep the hashtag's text."""
        return re.sub(r"#", "", input_text)

    def emoji_oneword(self, input_text):
        """Remove underscores so a multi-part emoji name stays one word."""
        # By compressing the underscore, the emoji is kept as one word
        return input_text.replace('_', '')

    def remove_punctuation(self, input_text):
        """Replace every ``string.punctuation`` character with a space."""
        # NOTE(review): string.punctuation is ASCII-only, so Arabic marks
        # such as "،" or "؟" are left untouched -- confirm this is intended.
        punct = string.punctuation
        # Map each punctuation symbol to a single space.
        trantab = str.maketrans(punct, ' ' * len(punct))
        return input_text.translate(trantab)

    def remove_digits(self, input_text):
        """Delete all runs of digits."""
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        return re.sub(r'\d+', '', input_text)

    def remove_stopwords(self, input_text):
        """Drop stop words and one-character tokens, then re-join on spaces."""
        # Some words which might indicate a certain sentiment are kept via a whitelist
        ### whitelist = ["n't", "not", "no"]
        words = input_text.split()
        clean_words = [
            word for word in words
            if len(word) > 1 and word not in stopwords
        ]
        return " ".join(clean_words)

    def stem(self, input_text):
        """Stem every whitespace-separated token with ``ar_stemmer``."""
        stemmed_words = [
            ar_stemmer.stemWord(word) for word in input_text.split()
        ]
        return " ".join(stemmed_words)

    def fit(self, X, y=None, **fit_params):
        """No-op fit: nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, **transform_params):
        """
        Apply all cleaning steps, in order, to every element of ``X``.

        Args:
            X: object exposing ``apply`` (a pandas column of tweet strings
                in this notebook).

        Returns:
            The result of chaining ``X.apply(step)`` over all steps.
        """
        # Mentions and URLs are handled before repeated characters are
        # collapsed; otherwise "www" is shortened to "ww" and such URLs can
        # no longer be recognised.
        steps = (
            self.remove_hashtags,
            self.remove_mentions,
            self.remove_urls,
            self.remove_repeating_char,
            self.emoji_oneword,
            self.remove_punctuation,
            self.remove_digits,
            self.remove_stopwords,
            self.stem,
        )
        clean_X = X
        for step in steps:
            clean_X = clean_X.apply(step)
        return clean_X

In [None]:
# Use the Arabic cleaner defined in this notebook.
ct = CleanText_Arabic()
# fit() learns nothing from the data, so one fit_transform on the training
# tweets is enough; dev and test are only transformed.
training_clean = ct.fit_transform(training_df.Tweet)
dev_clean = ct.transform(dev_df.Tweet)
test_clean = ct.transform(test_df.Tweet)

In [None]:
# Notebook display of the cleaned training tweets.
training_clean

In [None]:
# Wrap each cleaned tweet column in a frame and append the matching
# shortened valence labels as a 'Valence score' column.
training_cleaned = pd.DataFrame(training_clean).assign(
    **{'Valence score': training_valence}
)

dev_cleaned = pd.DataFrame(dev_clean).assign(
    **{'Valence score': dev_valence}
)

test_cleaned = pd.DataFrame(test_clean).assign(
    **{'Valence score': test_valence}
)

In [None]:
# Write every cleaned split to its output path as a tab-separated file.
for cleaned_frame, out_path in (
    (training_cleaned, training_outpath),
    (dev_cleaned, dev_outpath),
    (test_cleaned, test_outpath),
):
    cleaned_frame.to_csv(out_path, sep="\t")

In [None]:
# Notebook display of the final training frame (cleaned tweets + valence).
training_cleaned