## Normalization Process


### General Lib

In [1]:
import pandas as pd
import re
import numpy as np
import contractions


# Widen pandas' display limits so long text cells are shown in full.
pd.set_option("display.max_colwidth", 1000)
pd.set_option("display.width", 10000)
# None removes the row cap, so every row of a displayed frame is printed.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", 1000)

### Normalization Lib

In [2]:
# ASCII punctuation characters: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
# (available as string.punctuation)
# Standard library: provides string.punctuation for the punctuation filter.
import string

# Expands contractions such as "i'm".
import contractions

# NLTK pieces: stop-word corpus, lemmatizer, POS tagger and tokenizer.
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below: the stop-word list, WordNet for the
# lemmatizer, and the data needed for tokenization and POS tagging.
for resource in ('stopwords', 'wordnet', 'punkt_tab', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# English stop words (i, am, he, she, on, at, ...) plus the "'s" token.
# NOTE: from here on the name `stopwords` refers to this plain list, not to
# the NLTK corpus reader imported above.
stopwords = stopwords.words('english') + ["\'s"]

# Lemmatizer object (a stemmer such as PorterStemmer was considered instead).
# Stemming is simpler and would treat e.g. "interested" / "interesting" as
# similar, but lemmatization was chosen with word2vec-style word embeddings
# in mind.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ngpbm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ngpbm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ngpbm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\ngpbm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


### Normalization Functions

In [3]:
# Normalization pipeline, in order:
#   1. expand contractions
#   2. lowercase
#   3. tokenize: "i am" -> ['i', 'am']
#   4. POS-tag the tokens
#   5. remove punctuation
#   6. remove stop words (i, will, am, he, a, an, the, ...), tokens containing digits, and adverbs
#   7. lemmatize

# Helper function to map POS tag to WordNet POS
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet part-of-speech code.

    The decision is made on the tag's leading letter only.

    Args:
        tag: POS tag string (e.g. 'JJ', 'VBD', 'NNS', 'RB').

    Returns:
        'a' (adjective), 'v' (verb), 'n' (noun) or 'r' (adverb) when the tag
        starts with 'J', 'V', 'N' or 'R' respectively; otherwise None.
    """
    prefix_to_wordnet = (
        ('J', 'a'),  # adjective
        ('V', 'v'),  # verb
        ('N', 'n'),  # noun
        ('R', 'r'),  # adverb
    )
    for prefix, wordnet_code in prefix_to_wordnet:
        if tag.startswith(prefix):
            return wordnet_code
    # No WordNet equivalent for this tag family.
    return None

def stop_words_removal_then_lemmatize(pos_tags):
    """Filter POS-tagged tokens and lemmatize the ones that remain.

    A token is dropped when it

    * consists only of punctuation characters,
    * is in the module-level ``stopwords`` collection,
    * contains at least one digit, or
    * is tagged as an adverb ('RB', 'RBR', 'RBS').

    Every surviving token is lemmatized with the module-level ``lemmatizer``,
    passing the WordNet POS from ``get_wordnet_pos`` when one exists and
    falling back to the lemmatizer's default otherwise.

    Args:
        pos_tags: iterable of ``(word, tag)`` pairs.

    Returns:
        list of lemmatized tokens, in their original order.
    """
    adverb_tags = ('RB', 'RBR', 'RBS')
    processed_tokens = []
    for word, tag in pos_tags:
        # `word in string.punctuation` would be a *substring* test, which lets
        # multi-character punctuation tokens such as "``", "''", "..." or "--"
        # slip through. Checking every character catches them as well (and
        # still drops an empty token, as before).
        if all(char in string.punctuation for char in word):
            continue
        # Stop words and anything containing a digit are discarded.
        if word in stopwords or re.search(r'\d', word):
            continue
        # Adverbs are removed; adjectives are intentionally kept.
        if tag in adverb_tags:
            continue
        pos = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(word, pos) if pos else lemmatizer.lemmatize(word)
        processed_tokens.append(lemma)
    return processed_tokens


def normalize_tokenize(str):
    """Turn raw text into a list of normalized tokens.

    Steps: expand contractions, lowercase, tokenize, POS-tag, then hand the
    tagged tokens to ``stop_words_removal_then_lemmatize``.

    Args:
        str: the input text. The name shadows the built-in ``str`` inside this
            function; it is kept unchanged for caller compatibility.

    Returns:
        The list returned by ``stop_words_removal_then_lemmatize``.
    """
    # Expand contractions first, then lowercase the expanded text.
    lowered_text = contractions.fix(str).lower()

    # Tokenize and tag in one go; the tags drive filtering and lemmatization.
    tagged_tokens = pos_tag(word_tokenize(lowered_text))

    return stop_words_removal_then_lemmatize(tagged_tokens)

### Normalize User Preference

In [4]:
# Load user preference data
user_pref_df = pd.read_csv("../user_preferences.csv")
user_pref_df.head(3)

Unnamed: 0,preference
0,"I’m interested in a Master of Science in Computer Science, ideally in Berlin, where I can gain exposure to cutting-edge AI technologies."
1,A Master of Arts in Business Administration with an international focus would align perfectly with my career goals in global management.
2,"I’m looking for a Master of Science in Environmental Science in Hamburg, especially one that emphasizes sustainability and climate research."


In [None]:
# Add a column holding the normalized token list for each preference text.
preference_text = user_pref_df['preference']
user_pref_df['normalize_tokenize'] = preference_text.apply(normalize_tokenize)
user_pref_df.head(3)



### Normalize University Data 

In [6]:
# Read the university programme data and preview three rows picked by position.
university_df = pd.read_csv(filepath_or_buffer="../university_data.csv")
preview_positions = [100, 3000, 8000]
university_df.iloc[preview_positions]

Unnamed: 0,program_name,program_url,university,location,duration,degreeType,language,subject,studyMode,admission_Modus,admission_Requirements,overview,teaching,researching
100,Aesthetics and Media Science,http://www.uni-oldenburg.de/nc/studium/studiengang/?id_studg=312,University of Oldenburg,Oldenburg,4 semesters,Master of Arts,German,Art Studies,full time,without admission restriction,1. Bachelor's degree or equivalent degree in the field of Arts and Media Studies or another subject-specific degree program2. at least 30 credit points for subject-related and didactic content.more information regarding admission requirements. Bachelor/Bakkalaureus,,,
3000,Economics,https://www.uni-heidelberg.de/de/studium/alle-studienfaecher/economicspolitische-oekonomik/wirtschaftswissenschaft-teilstudiengang-im-master-education,Heidelberg University,Heidelberg,4 semesters,Master of Education,German,"Economic Sciences, Economics",full time,without admission restriction,"Admission restrictions, see admission regulations\;more information regarding admission requirements. Bachelor/Bakkalaureus",,,
8000,Physics,https://www.zsb.uni-wuppertal.de/studieninfos/studieninfos/master/physik-msc.html,University of Wuppertal,Wuppertal,4 semesters,Master of Science,German,Physics,full time,without admission restriction,"Bachelor of Science' or equivalent degree in the Physics course or in a course recognised as equivalent at an institute of higher education in thearea of validity of Germany's Basic Law at least with the grade Satisfactory (3.0) orhas acquired a 'Bachelor of Science' or equivalent degree in the Physics course or in a course recognised as equivalent at an institute of higher education without or outside the area of validity of Germany's Basic Law anda) oral entrance exam lasting 20 to 40 minutes orb) the Graduate Record Examinations Subject (GRE) Test in Physics. Bachelor/Bakkalaureus(and other qualifications, provided that they are recognised as being equivalent)",,,


In [None]:
# Build one free-text description per programme from the descriptive columns.
# program_url, duration, admission_Modus and admission_Requirements are left
# out on purpose (not needed for the merged text).
# The columns are selected by name rather than by position so the selection
# does not silently change if the CSV column order changes.
merge_columns = [
    'program_name', 'university', 'location', 'degreeType', 'language',
    'subject', 'studyMode', 'overview', 'teaching', 'researching',
]
# Join the non-missing values of each row with ". " as separator.
university_df['merge_raw'] = university_df[merge_columns].apply(
    lambda row: '. '.join(row.dropna().astype(str)), axis=1
)
# Normalize and tokenize the merged text with the same pipeline as the user preferences.
university_df['merge_normalize_tokenize'] = university_df['merge_raw'].apply(normalize_tokenize)
university_df['merge_normalize_tokenize'].head(3)



In [None]:
# Write both frames, including the new token columns, to CSV without the index.
for frame, target_path in (
    (user_pref_df, "../user_preferences_normalized.csv"),
    (university_df, "../university_data_normalized.csv"),
):
    frame.to_csv(target_path, index=False)