In [49]:
import pandas as pd
import numpy as np

### Reading the file from the Assignment

In [None]:
data = pd.read_csv("languages.tsv", delimiter="\t", encoding="utf-8", engine="python")

In [None]:
# filter to get columns that are needed
features = data.columns.values[10:]
filtered = data[["iso_code", "Name", "genus"]+list(features)]
filtered.dropna(how='all', inplace=True)

In [56]:
filtered.columns

Index(['iso_code', 'Name', 'genus', '1A Consonant Inventories',
       '2A Vowel Quality Inventories', '3A Consonant-Vowel Ratio',
       '4A Voicing in Plosives and Fricatives',
       '5A Voicing and Gaps in Plosive Systems', '6A Uvular Consonants',
       '7A Glottalized Consonants',
       ...
       '137B M in Second Person Singular', '136B M in First Person Singular',
       '109B Other Roles of Applied Objects',
       '10B Nasal Vowels in West Africa',
       '25B Zero Marking of A and P Arguments',
       '21B Exponence of Tense-Aspect-Mood Inflection',
       '108B Productivity of the Antipassive Construction',
       '130B Cultural Categories of Languages with Identity of 'Finger' and 'Hand'',
       '58B Number of Possessive Nouns',
       '79B Suppletion in Imperatives and Hortatives'],
      dtype='object', length=195)

In [57]:
columns = filtered.columns
infos = {}
for i in range(filtered.shape[0]):
    id, name, genus = filtered.at[i,columns[0]], filtered.at[i,columns[1]], filtered.at[i,columns[2]]
    if id is not np.NAN:
        if id not in infos:
            infos[id]={"Name": name, "Genus":genus, "Features":[]}
        for f in columns[3:]:
            if filtered.at[i,f] is not np.NAN:
                infos[id]["Features"].append((" ".join(str(f).split()), filtered.at[i,f]))
    

In [59]:
infos['deu']

{'Name': 'German (Berlin)',
 'Genus': 'Germanic',
 'Features': [('54A Distributive Numerals', '1 No distributive numerals'),
  ('1A Consonant Inventories', '3 Average'),
  ('2A Vowel Quality Inventories', '3 Large (7-14)'),
  ('3A Consonant-Vowel Ratio', '1 Low'),
  ('4A Voicing in Plosives and Fricatives',
   '4 In both plosives and fricatives'),
  ('5A Voicing and Gaps in Plosive Systems',
   '2 None missing in /p t k b d g/'),
  ('6A Uvular Consonants', '3 Uvular continuants only'),
  ('7A Glottalized Consonants', '1 No glottalized consonants'),
  ('8A Lateral Consonants', '2 /l/, no obstruent laterals'),
  ('9A The Velar Nasal', '2 No initial velar nasal'),
  ('10A Vowel Nasalization', '2 Contrast absent'),
  ('11A Front Rounded Vowels', '2 High and mid'),
  ('12A Syllable Structure', '3 Complex'),
  ('13A Tone', '1 No tones'),
  ('14A Fixed Stress Locations', '1 No fixed stress'),
  ('15A Weight-Sensitive Stress', '4 Right-oriented: One of the last three'),
  ('16A Weight Factors 

Please do not call the function below!!! This is just for showing. The result is saved in "all_languages_sm_score.pkl".

In [154]:
#compute the similar shared features scores for each language pairs, very computational expensive so I saved the result
def get_all_sm_scores(filtered_data):
    all_sim_scores = {}
    for c1 in filtered["iso_code"].values:
        for c2 in filtered["iso_code"].values:
            if c1 is not np.NAN and c2 is not np.NAN:
                if (c1, c2) not in all_sim_scores and (c2, c1) not in all_sim_scores:
                    print(c1, c2)
                    all_sim_scores[(c1, c2)] = 0
                f1, f2 = infos[c1]["Features"], infos[c2]["Features"]
                share_feats = np.sum([v == t for k, v in f1 for f, t in f2 if k==f])
                all_sim_scores[(c1, c2)] = share_feats/len(features)    
    return all_sim_scores              

In [101]:
#import pickle
#with open('all_langs_sm_scores.pkl', 'wb') as file:     
    #pickle.dump(all_sim_scores, file) 

#### Task 1
I didn't do anything fancy, basically just compared all the features with non-none values and count how many of them share within two languages and average by total type features. <br>
The result shows that this Hamming distance of features are not the best measure of language similarities given WALS features. For example, it returns the closest language to Czech is Latvian, but I would assume it's Slovak. I would probably add the geographical distance as a feature as well, closer the distance of two languages (not always), should be more similar to each other. Or use whether they belong to the same family as another feature in addition.

In [73]:
def get_most_similar_languages(iso_code):
        this_features = infos[iso_code]["Features"]
        max, similar_lang = 0, iso_code
        for code in filtered["iso_code"]:
            if code != iso_code and code is not np.NAN:               
                features = infos[code]["Features"]
                share_f = np.sum([v == t for k, v in this_features for f, t in features if k==f])
                if share_f > max:
                    max = share_f
                    similar_lang = code, infos[code]["Name"]
        print("Shared features in total:", max)
        sim_score = max/len(features)
        return similar_lang, sim_score

In [74]:
get_most_similar_languages("deu")

Shared features in total: 107


(('eng', 'English'), 3.4516129032258065)

#### Task 2
I don't have very good way to do it. What I did is to get all similarity scores of all language pairs (very computational expensive) and use it to retrive the language pair similarities within a genus and compute on average which language has the largest similarity score to all other languagesin the same genus. <br>
So it shows that this method returns plausible results. It returns Russian as the centroid of Slavic languages and German as Germanic languages, for example.

In [152]:
def get_centroid_lang(genus):
    if genus not in set(filtered["genus"].values):
        print("This genus does not exit in the current data!")
    else: 
        with open('all_langs_sm_scores.pkl', 'rb') as file: 
            all_sim_scores = pickle.load(file) 
        avg_sm_scores = {}
        codes_in_genus = filtered[filtered["genus"]==genus]["iso_code"]
        for lang1 in codes_in_genus:
            sm_score_lang1 = []
            for lang2 in codes_in_genus:
                sm_score = 0
                if (lang1, lang2) in all_sim_scores:
                    sm_score = all_sim_scores[(lang1, lang2)]
                elif (lang2, lang1) in all_sim_scores:
                    sm_score = all_sim_scores[(lang2, lang1)]
                sm_score_lang1.append(sm_score)
            if lang1 not in avg_sm_scores:
                avg_sm_scores[lang1] = np.mean(sm_score_lang1)
        return max(avg_sm_scores, key=avg_sm_scores.get), avg_sm_scores

In [153]:
get_centroid_lang("Slavic")[0], get_centroid_lang("Germanic")[0], get_centroid_lang("Chinese")

('rus',
 'deu',
 ('cmn',
  {nan: 0.0,
   'yue': 0.1546875,
   'cdo': 0.03020833333333333,
   'hak': 0.07708333333333334,
   'cmn': 0.2390625,
   'wuu': 0.03072916666666666}))

#### Task 3
In this task, I count the values of each features to extract the least common value of each feature in the data and use these (feature, value) pair to locate the languages that have this exact feature, and in the end count which language shows up most frequently in these rare cases. <br>
The result shows that the top 3 languages that have the features with the least common value (rare) that occur less than 10 times (less than 10 languages have this feature and value) across all feature_value pairs. They are [('Wichita', 6), ('Semelai', 4), ('Mandarin', 4)]. In my method, "Wichita" has the most (6) rare features, across all rare feature_values, followed by Semelai and Mandarin.

In [129]:
from collections import Counter
f_v_langs = {}
for f in features:
    least_common = Counter(filtered[f].values).most_common()[-1]
    if least_common[1] < 10:
        rare_v = least_common[0]
        rare_v_langs = filtered[filtered[f]==rare_v]['Name'].to_list()
        if (f, rare_v) not in f_v_langs:
            f_v_langs[(f, rare_v)] = rare_v_langs

f_v_langs.keys()

dict_keys([('5A Voicing and Gaps in Plosive Systems', '5 Both missing'), ('7A Glottalized Consonants', '8 Ejectives, implosives, and glottalized resonants'), ('8A Lateral Consonants', '5 No /l/, but lateral obstruents'), ('11A Front Rounded Vowels', '4 Mid only'), ('14A Fixed Stress Locations', '4 Third'), ('15A Weight-Sensitive Stress', '2 Left-oriented: One of the first three'), ('17A Rhythm Types', '3 Dual: both trochaic and iambic'), ('18A Absence of Common Consonants', '6 No fricatives or nasals'), ('19A Presence of Uncommon Consonants', "6 Clicks, pharyngeals, and 'th'"), ('20A Fusion of Selected Inflectional Formatives', '4 Tonal/isolating'), ('21A Exponence of Selected Inflectional Formatives', '4 Case + TAM'), ('22A Inflectional Synthesis of the Verb', '7 12-13 categories per word'), ('23A Locus of Marking in the Clause', '5 Other'), ('24A Locus of Marking in Possessive Noun Phrases', '5 Other'), ('25A Locus of Marking: Whole-language Typology', '4 Zero-marking'), ('33A Coding

In [132]:
l = []
for v in f_v_langs.values():
    l += v
Counter(l).most_common(3)

[('Wichita', 6), ('Semelai', 4), ('Mandarin', 4)]