In [1]:
## Raw album listing: one "Title (Year)" entry per line, each padded
## with stray surrounding spaces
text_chunk = '\n'.join([
    ' The Velvet Underground & Nico (1967) ',
    'White Light/White Heat (1968) ',
    'The Velvet Underground (1969) ',
    'Loaded (1970) ',
])

In [2]:
## Save the text to a file on disk

output_path = '/sharedfolder/vu.txt'
with open(output_path, mode='w') as file_out:
    file_out.write(text_chunk)

In [3]:
## Read text from disk

# Use a context manager so the file handle is closed as soon as the read
# finishes, rather than being left open until it is garbage-collected.
with open('/sharedfolder/vu.txt') as file_in:
    text_chunk = file_in.read()

text_chunk

' The Velvet Underground & Nico (1967) \nWhite Light/White Heat (1968) \nThe Velvet Underground (1969) \nLoaded (1970) '

In [4]:
### Turn the text into a list of lines by splitting on the newline character

newline = '\n'
string_list = text_chunk.split(newline)

string_list

[' The Velvet Underground & Nico (1967) ',
 'White Light/White Heat (1968) ',
 'The Velvet Underground (1969) ',
 'Loaded (1970) ']

In [5]:
## Trim leading and trailing whitespace from every string by mapping str.strip over the list

string_list = list(map(str.strip, string_list))

string_list

['The Velvet Underground & Nico (1967)',
 'White Light/White Heat (1968)',
 'The Velvet Underground (1969)',
 'Loaded (1970)']

In [6]:
## Shuffle the order of items in a list

import random

random.shuffle(string_list)

# Note that random.shuffle(string_list) shuffles the list in place: the
# previous order of string_list is overwritten rather than copied.
# No random.seed() call appears before this cell, so the resulting order
# will generally differ from one run to the next.

string_list

['Loaded (1970)',
 'White Light/White Heat (1968)',
 'The Velvet Underground & Nico (1967)',
 'The Velvet Underground (1969)']

In [7]:
## Put the list in alphabetical order

# Copy the list first and sort the copy, so string_list keeps its current
# order and the alphabetized version lives in its own variable.
sorted_list = list(string_list)
sorted_list.sort()

sorted_list

['Loaded (1970)',
 'The Velvet Underground & Nico (1967)',
 'The Velvet Underground (1969)',
 'White Light/White Heat (1968)']

In [8]:
## Check that string_list itself still has its shuffled order (sorted() did not modify it)
string_list

['Loaded (1970)',
 'White Light/White Heat (1968)',
 'The Velvet Underground & Nico (1967)',
 'The Velvet Underground (1969)']

In [9]:
## Upper-case every item of the alphabetized list using a list comprehension

alphabetized = sorted(string_list)
upper_list = [title.upper() for title in alphabetized]

upper_list

['LOADED (1970)',
 'THE VELVET UNDERGROUND & NICO (1967)',
 'THE VELVET UNDERGROUND (1969)',
 'WHITE LIGHT/WHITE HEAT (1968)']

In [10]:
## Break each item into a list of words by splitting on single spaces

list_of_lists = []
for entry in string_list:
    list_of_lists.append(entry.split(' '))

list_of_lists

[['Loaded', '(1970)'],
 ['White', 'Light/White', 'Heat', '(1968)'],
 ['The', 'Velvet', 'Underground', '&', 'Nico', '(1967)'],
 ['The', 'Velvet', 'Underground', '(1969)']]

In [11]:
## Pull out the year of each item as a string (parentheses still attached)

# The year is the last space-separated token of each entry.
last_tokens = (entry.split(' ')[-1] for entry in string_list)
years = list(last_tokens)

years

['(1970)', '(1968)', '(1967)', '(1969)']

In [12]:
## Same extraction, now with .strip('()') applied to drop the parentheses

years = []
for entry in string_list:
    last_token = entry.split(' ')[-1]
    years.append(last_token.strip('()'))

years

['1970', '1968', '1967', '1969']

In [13]:
## Pull out the years and convert them to integers

# First isolate the bare year text, then convert each one with int().
year_strings = [entry.split(' ')[-1].strip('()') for entry in string_list]
years = [int(year_text) for year_text in year_strings]

years

[1970, 1968, 1967, 1969]

In [14]:
## Isolate titles by keeping everything before the first ' (' in each item

[entry.partition(' (')[0] for entry in string_list]

['Loaded',
 'White Light/White Heat',
 'The Velvet Underground & Nico',
 'The Velvet Underground']

In [15]:
## Recover titles by splitting at spaces, dropping the last word (the year), and re-joining the rest

[' '.join(words[:-1]) for words in (entry.split(' ') for entry in string_list)]

['Loaded',
 'White Light/White Heat',
 'The Velvet Underground & Nico',
 'The Velvet Underground']

In [16]:
## Turn the list of strings into a matrix of token counts

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# Fit the vectorizer on string_list and count tokens in one step, then
# convert the result to a plain array so it can be displayed.
sparse_counts = vectorizer.fit_transform(string_list)
data = sparse_counts.toarray()

data

array([[0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2],
       [1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0]], dtype=int64)

In [17]:
## List the vocabulary learned by the vectorizer (one name per column of the count matrix)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Prefer the new method when it exists and
# fall back to the old one on older installations. The new method returns an
# array, so convert it to a list to keep the printed format unchanged.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

print(feature_names)

['1967', '1968', '1969', '1970', 'heat', 'light', 'loaded', 'nico', 'the', 'underground', 'velvet', 'white']


In [18]:
## Show the strings the vectorizer was fitted on, for comparison with the feature names
string_list

['Loaded (1970)',
 'White Light/White Heat (1968)',
 'The Velvet Underground & Nico (1967)',
 'The Velvet Underground (1969)']

In [19]:
## Count tokens in new strings by reusing the count vectorizer fitted above

new_strings = ['Some Velvet Sidewalk', 'The The', '1969']
new_vector = vectorizer.transform(new_strings)

new_vector.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])