In [1]:
import pandas as pd
import string

In [2]:
data_url = 'data/alice-in-wonderland.txt'

alice_series = pd.Series(open(data_url).read().split())

alice_series

0           ï»¿The
1          Project
2        Gutenberg
3            EBook
4               of
           ...    
12758           to
12759         hear
12760        about
12761          new
12762      eBooks.
Length: 12763, dtype: object

> The read method returns a string containing the contents of the file. What if
the file contains several terabytes of data? I suggest that people not read an entire file into memory at once, instead iterating over its lines. In this particular case, I know that the
file is small, and that there won’t be any issues with reading it all at once.

In [3]:
# What are the 10 most common words in the book?
(
    alice_series
    # lowercase the entire series
    .str.lower()
    # Strip all punctuations
    .str.strip(string.punctuation)
    # Now count the top 10 words
    .value_counts()[:10]
)

the    807
and    404
a      328
to     327
of     318
she    237
in     227
it     183
you    171
was    168
dtype: int64

In [4]:
# count the number of capitalized words in the book

(
    alice_series[
    (alice_series.str.strip(string.punctuation)
    .str.contains('^[A-Z]\w*$', regex=True)
    )
    ]
).count()

1686

> The value will be True whenever the word starts with a capital letter (anchored to the start of the string with ^) and contains zero or more alphanumeric characters (\w*) through the end of the word.

> Count the number of vowels (a, e, i, o, and u) in each word. What is the average number
of vowels per word?

In [5]:
def count_vowels(one_word):
    total = 0
    for one_letter in one_word.lower():
        if one_letter in 'aeiou':
            total += 1
    return total

In [6]:
alice_series.apply(count_vowels).mean()

1.66379377889211

In [7]:
wine_url = 'data/winemag-150k-reviews.csv'

wine_df = pd.read_csv(wine_url, usecols=['country', 'province', 'description', 'variety'])

# Preview the data
wine_df.head()

Unnamed: 0,country,description,province,variety
0,US,This tremendous 100% varietal wine hails from ...,California,Cabernet Sauvignon
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Northern Spain,Tinta de Toro
2,US,Mac Watson honors the memory of a wine once ma...,California,Sauvignon Blanc
3,US,"This spent 20 months in 30% new French oak, an...",Oregon,Pinot Noir
4,France,"This is the top wine from La Bégude, named aft...",Provence,Provence red blend


In [8]:
common_wine_words = ['flavors', 'aromas', 'finish', 'drink', 'palate']

(
    wine_df['description']
    # Turn to lowercase
    .str.lower()
    # Split each word in the sentence into individual words
    .str.split()
    # return a new series  one in which each object has its own row whilst maintaining the index
    .explode()
    # Remove all punctuations
    .str.strip(string.punctuation)
    # Remove words with length < 5 and common words
    .loc[(lambda _: (_.str.len() >= 5) & ~(_.isin(common_wine_words)))]
)[:10]

0    tremendous
0      varietal
0         hails
0      oakville
0         three
0         years
0         juicy
0    red-cherry
0         fruit
0    compelling
Name: description, dtype: object

In [9]:
def top_10_words(series: pd.Series):
    common_wine_words = ['flavors', 'aromas', 'finish', 'drink', 'palate']

    return (
                series
                # Turn to lowercase
                .str.lower()
                # Split each word in the sentence into individual words
                .str.split()
                # return a new series  one in which each object has its own row whilst maintaining the index
                .explode()
                # Remove all punctuations
                .str.strip(string.punctuation)
                # Remove words with length < 5 and common words
                .loc[(lambda _: (_.str.len() >= 5) & ~(_.isin(common_wine_words)))]
                # Return the top 10
                [:10]
          )
    
top_10_words(wine_df['description'])

0    tremendous
0      varietal
0         hails
0      oakville
0         three
0         years
0         juicy
0    red-cherry
0         fruit
0    compelling
Name: description, dtype: object

In [10]:
# 10 most common words used in French wine reviews

top_10_words(series = wine_df.loc[(wine_df['country'] == 'France'), 'description'])

4          bégude
4           named
4           after
4         highest
4           point
4        vineyard
4       structure
4         density
4    considerable
4         acidity
Name: description, dtype: object

Tasks

- Open the file so_2021_survey_results.csv, and read it into a data frame. We only
need the columns LanguageHaveWorkedWith, LanguageWantToWorkWith, Country,
and CompTotal.
- What are the different programming languages that developers currently use?
- What are the 10 programming languages most commonly used today?
- What are the 10 programming languages people most want to use?
- What languages are on both top-10 lists?
- What languages in the top 10 have people worked with, but don’t want to work with in
the future?
- What is the most popular (current) language used by people in each country?
- What is the mean number of languages used in the last year?
- What is the greatest number of languages people listed as having used in the last year?
- How many people chose that largest number?
- How many people in the survey claim salaries of $2m or above?
- Remove rows in which salaries are below $2m
- Turn the 'LanguageHaveWorkedWith' column into "dummy" columns in df, such that
each language is its own column.
- If you want to maximize your salary, and have to choose two languages from Python,
JavaScript, and Java, then what combination would be best?

In [11]:
data_url = 'data/so_2021_survey_results.csv'

so_df = pd.read_csv(data_url, usecols=['LanguageHaveWorkedWith', 'LanguageWantToWorkWith', 'Country', 'CompTotal'])

so_df.head()

Unnamed: 0,Country,CompTotal,LanguageHaveWorkedWith,LanguageWantToWorkWith
0,Slovakia,4800.0,C++;HTML/CSS;JavaScript;Objective-C;PHP;Swift,Swift
1,Netherlands,,JavaScript;Python,
2,Russian Federation,,Assembly;C;Python;R;Rust,Julia;Python;Rust
3,Austria,,JavaScript;TypeScript,JavaScript;TypeScript
4,United Kingdom of Great Britain and Northern I...,,Bash/Shell;HTML/CSS;Python;SQL,Bash/Shell;HTML/CSS;Python;SQL


In [18]:
# What different programming languages programmers use

have_worked_with = (
    so_df['LanguageHaveWorkedWith']
    # Access the string and split on the separator
    .str.split(';')
    # Now, Expand this selection
    .explode()
    # Lets count the occurrence
    .value_counts()
    # Select the top 10
    [:10]
    # Get the names of the languages
    .index
)

In [19]:
want_to_work_with = (
    so_df['LanguageWantToWorkWith']
    # Access the string and split on the separator
    .str.split(';')
    # Now, Expand this selection
    .explode()
    # Lets count the occurrence
    .value_counts()
    # Select the top 10
    [:10]
    # Get the names of the languages
    .index
)

In [21]:
# what languages are on both top-10 lists

want_to_work_with.intersection(have_worked_with)

Index(['JavaScript', 'Python', 'HTML/CSS', 'TypeScript', 'SQL', 'Node.js',
       'C#', 'Java'],
      dtype='object')

In [23]:
# What languages in the top 10 have people worked with, but don’t want
# to work with in the coming year?

have_worked_with[~(have_worked_with.isin(want_to_work_with))]

Index(['Bash/Shell', 'C++'], dtype='object')

In [24]:
# which language is most popular in each country

all_languages = (
    so_df['LanguageHaveWorkedWith']
    # Access the string and split on the separator
    .str.split(';')
    # Now, Expand this selection
    .explode()
)




In [33]:
(
    so_df[['Country']]
    # Join with the all languages series
    .join(all_languages)
    # Groupby the country column
    .groupby('Country')
    # Agg
    .agg(pd.Series.mode) 
)

Unnamed: 0_level_0,LanguageHaveWorkedWith
Country,Unnamed: 1_level_1
Afghanistan,JavaScript
Albania,JavaScript
Algeria,JavaScript
Andorra,JavaScript
Angola,"[HTML/CSS, JavaScript]"
...,...
"Venezuela, Bolivarian Republic of...",JavaScript
Viet Nam,JavaScript
Yemen,"[C#, HTML/CSS]"
Zambia,HTML/CSS


In [35]:
# find the mean number of languages that developers used in the last year

(
    so_df['LanguageHaveWorkedWith']
    # Split text 
    .str.split(';')
    # Determine the length of the list
    .str.len()
    # Find the mean
    .mean()
)

5.373678011583714

In [38]:
# find out the greatest number of languages anyone had indicated they used in the last year

(
    so_df['LanguageHaveWorkedWith']
    # Split text
    .str.split(';')
    .str.len()
    # Find the max
    .max()
)

38.0

In [41]:
(
    so_df['CompTotal']
    .loc[(so_df['CompTotal'] >= 2_000_000)]
    .count()
)

2369

> I want to take the LanguageHaveWorkedWith
column, and turn it into multiple columns. That’ll allow us to more easily analyze the individual
languages. Doing this is known as creating "dummy columns."

In [43]:
so_df = so_df.loc[(so_df['CompTotal'] < 2_000_000)]


dummy_lang = (
    so_df['LanguageHaveWorkedWith']
    .str.get_dummies(sep=';')
)


so_df = pd.concat([dummy_lang, so_df], axis='columns')

In [45]:
# First, what was the average salary of someone who knows Python and JavaScript, but not Java
(
    so_df['CompTotal'].loc[(
                        (so_df['Python'] == 1) &
                        (so_df['JavaScript'] == 1) &
                        (so_df['Java'] == 0)
                      )]
    .mean()
)

126817.99470235605

In [46]:
# What about someone who knows Python and Java, but not JavaScript?

(
    so_df['CompTotal'].loc[(
                        (so_df['Python'] == 1) &
                        (so_df['JavaScript'] == 0) &
                        (so_df['Java'] == 1)
                      )]
    .mean()
)

162737.10379596677

In [47]:
# what about someone who knows Java and JavaScript, but not Python

(
    so_df['CompTotal'].loc[(
                        (so_df['Python'] == 0) &
                        (so_df['JavaScript'] == 1) &
                        (so_df['Java'] == 1)
                      )]
    .mean()
)

140867.65981559738